# Setting up environment and Loading data from json file into dataframe

In [4]:
# Finally, setup our Spark session 
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [5]:
# spark is an existing SparkSession 
spark 

In [6]:
# import packages 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from pyspark.sql.functions import *
from pyspark.sql.functions import col, trim, length, to_date, year, month, collect_list, max
from pyspark.sql import functions as F 

In [7]:
# Load the data into DataFrames
df_lighter_books = spark.read.json("lighter_books.json")
df_lighter_authors = spark.read.json("lighter_authors.json")
df_list = spark.read.json("list.json")

# [RQ1] Exploratory Data Analysis (EDA)

In [6]:
# Print the schema of the dataframe 
df_lighter_books.printSchema() 

root
 |-- asin: string (nullable = true)
 |-- author_id: long (nullable = true)
 |-- author_name: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- role: string (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- description: string (nullable = true)
 |-- edition_information: string (nullable = true)
 |-- format: string (nullable = true)
 |-- id: long (nullable = true)
 |-- image_url: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: string (nullable = true)
 |-- language: string (nullable = true)
 |-- num_pages: long (nullable = true)
 |-- original_publication_date: string (nullable = true)
 |-- publication_date: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- rating_dist: string (nullable = true)
 |-- ratings_count: long (nullable = true)
 |-- series_id: str

In [7]:
# Print the schema of the dataframe 
df_lighter_authors.printSchema()

root
 |-- about: string (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- book_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- fans_count: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- image_url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- ratings_count: long (nullable = true)
 |-- text_reviews_count: long (nullable = true)
 |-- work_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- works_count: long (nullable = true)



In [8]:
# Print the schema of the dataframe 
df_list.printSchema()

root
 |-- books: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author: string (nullable = true)
 |    |    |-- author_id: string (nullable = true)
 |    |    |-- book_id: string (nullable = true)
 |    |    |-- position: struct (nullable = true)
 |    |    |    |-- ranking: long (nullable = true)
 |    |    |    |-- score: long (nullable = true)
 |    |    |    |-- votes: long (nullable = true)
 |    |    |-- title: string (nullable = true)
 |-- created_by: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- created_date: string (nullable = true)
 |-- description: string (nullable = true)
 |-- description_html: string (nullable = true)
 |-- id: string (nullable = true)
 |-- num_books: long (nullable = true)
 |-- num_comments: long (nullable = true)
 |-- num_likes: long (nullable = true)
 |-- num_pages: long (nullable = true)
 |-- num_voters: long (nullable = true)
 |-- tags: array (nullab

In [9]:
# Print the first 5 rows of the dataframe 
df_lighter_books.show(5)

+----+---------+---------------+--------------------+--------------+--------------------+-------------------+---------+-----+--------------------+----------+-------------+--------+---------+-------------------------+----------------+-----------------+--------------------+-------------+---------+-----------+---------------+---------------+------------------+--------------------+-------+
|asin|author_id|    author_name|             authors|average_rating|         description|edition_information|   format|   id|           image_url|      isbn|       isbn13|language|num_pages|original_publication_date|publication_date|        publisher|         rating_dist|ratings_count|series_id|series_name|series_position|        shelves|text_reviews_count|               title|work_id|
+----+---------+---------------+--------------------+--------------+--------------------+-------------------+---------+-----+--------------------+----------+-------------+--------+---------+-------------------------+------

In [10]:
# Print the first 5 rows of the dataframe
df_lighter_authors.show(5)

+--------------------+--------------+--------------------+----------+------+--------+--------------------+--------------------+-------------+------------------+--------------------+-----------+
|               about|average_rating|            book_ids|fans_count|gender|      id|           image_url|                name|ratings_count|text_reviews_count|            work_ids|works_count|
+--------------------+--------------+--------------------+----------+------+--------+--------------------+--------------------+-------------+------------------+--------------------+-----------+
|<i>Librarian Note...|          2.39|[37639181, 351301...|         1|      |16850813|https://s.gr-asse...|          Adil Aijaz|           18|                 2|[59226555, 564512...|          4|
|American ambassad...|          3.06|[290618, 735481, ...|         0|  male|   84378|https://s.gr-asse...|Thomas Patrick Me...|           18|                 5|[281951, 721665, ...|          9|
|Robert Mailer And...|        

In [11]:
# Print the first 5 rows of the dataframe
df_list.show(5)

+--------------------+-----------------+-------------------+--------------------+--------------------+----+---------+------------+---------+---------+----------+--------------------+--------------------+
|               books|       created_by|       created_date|         description|    description_html|  id|num_books|num_comments|num_likes|num_pages|num_voters|                tags|               title|
+--------------------+-----------------+-------------------+--------------------+--------------------+----+---------+------------+---------+---------+----------+--------------------+--------------------+
|[{Thomas Malory, ...| {, deleted user}|    June 25th, 2008|The best books pu...|\n      The best ...|  74|       57|           4|      101|        1|       144|[15th-century, be...|Best Books of the...|
|[{Suzanne Collins...| {, deleted user}|  October 1st, 2008|         YA Fiction!|\n      YA Fictio...| 881|     1862|          19|       72|       19|      3044|[fiction, young-a...|Mu

In [12]:
# Print the head of dataframe
df_lighter_books.head()

Row(asin='', author_id=1077326, author_name='J.K. Rowling', authors=[Row(id='1077326', name='J.K. Rowling', role=''), Row(id='2927', name='Mary GrandPré', role='Illustrator')], average_rating=4.5, description='There is a door at the end of a silent corridor. And it’s haunting Harry Pottter’s dreams. Why else would he be waking in the middle of the night, screaming in terror?<br /><br />Harry has a lot on his mind for this, his fifth year at Hogwarts: a Defense Against the Dark Arts teacher with a personality like poisoned honey; a big surprise on the Gryffindor Quidditch team; and the looming terror of the Ordinary Wizarding Level exams. But all these things pale next to the growing threat of He-Who-Must-Not-Be-Named - a threat that neither the magical government nor the authorities at Hogwarts can stop.<br /><br />As the grasp of darkness tightens, Harry must discover the true depth and strength of his friends, the importance of boundless loyalty, and the shocking price of unbearable 

In [13]:
# Print the head of dataframe
df_lighter_authors.head()

Row(about='Douglas Noël Adams was an English author, comic radio dramatist, and musician. He is best known as the author of the <i>\n  <a href="https://www.goodreads.com/book/show/11.Hitchhiker_s_Guide_to_the_Galaxy" title="Hitchhiker\'s Guide to the Galaxy" rel="nofollow noopener">Hitchhiker\'s Guide to the Galaxy</a>\n</i> series. Hitchhiker\'s began on radio, and developed into a "trilogy" of five books (which sold more than fifteen million copies during his lifetime) as well as a television series, a comic book series, a computer game, and a feature film that was completed after Adams\' death. The series has also been adapted for live theatre using various scripts; the earliest such productions used material newly written by Adams. He was known to some fans as Bop Ad (after his illegible signature), or by his initials "DNA".<br /><br />In addition to <i>The Hitchhiker\'s Guide to the Galaxy</i>, Douglas Adams wrote or co-wrote three stories of the science fiction television series 

In [35]:
# Print the head of dataframe
df_list.head()




In [15]:
# summary statistics of the dataframe
df_lighter_books.describe().show()

+-------+--------+------------------+-------------------+------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------+--------------------+--------+------------------+-------------------------+------------------+----------------------+--------------------+------------------+------------------+-----------------------------------+-----------------+------------------+-------+--------------------+
|summary|    asin|         author_id|        author_name|    average_rating|         description| edition_information|            format|                  id|           image_url|    isbn|              isbn13|language|         num_pages|original_publication_date|  publication_date|             publisher|         rating_dist|     ratings_count|         series_id|                        series_name|  series_position|text_reviews_count|  title|             work_id|
+-------+--------+------------------+-------------------+---------

In [16]:
# summary statistics of the dataframe
df_lighter_authors.describe().show()

+-------+--------------------+------------------+------------------+---------+------------------+--------------------+-------------------+-----------------+------------------+------------------+
|summary|               about|    average_rating|        fans_count|   gender|                id|           image_url|               name|    ratings_count|text_reviews_count|       works_count|
+-------+--------------------+------------------+------------------+---------+------------------+--------------------+-------------------+-----------------+------------------+------------------+
|  count|              351767|            351767|            351767|   351767|            351767|              351767|             351767|           351767|            351767|            351767|
|   mean|   1752.326923076923|3.6511943132812488|111.61573143586521|     NULL|7751861.1911975825|                NULL|           Infinity|4770.586308550831| 330.9932426862099|25.937137366495435|
| stddev|  437.4607532493

In [17]:
# summary statistics of the dataframe
df_list.describe().show()

+-------+-------------------+--------------------+--------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+--------------------+
|summary|       created_date|         description|    description_html|               id|         num_books|      num_comments|         num_likes|         num_pages|       num_voters|               title|
+-------+-------------------+--------------------+--------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+--------------------+
|  count|              81511|               81512|               81512|            81512|             81512|             81512|             81512|             81512|            81512|               81512|
|   mean|               NULL|               503.0|               503.0|82154.03516046717| 89.25469869467072| 1.515641868681912|15.345078025321426|1.5542742172931594|74.437211698890

In [18]:
# columns of the dataframe
df_lighter_books.columns

['asin',
 'author_id',
 'author_name',
 'authors',
 'average_rating',
 'description',
 'edition_information',
 'format',
 'id',
 'image_url',
 'isbn',
 'isbn13',
 'language',
 'num_pages',
 'original_publication_date',
 'publication_date',
 'publisher',
 'rating_dist',
 'ratings_count',
 'series_id',
 'series_name',
 'series_position',
 'shelves',
 'text_reviews_count',
 'title',
 'work_id']

In [19]:
# columns of the dataframe
df_lighter_authors.columns

['about',
 'average_rating',
 'book_ids',
 'fans_count',
 'gender',
 'id',
 'image_url',
 'name',
 'ratings_count',
 'text_reviews_count',
 'work_ids',
 'works_count']

In [20]:
# columns of the dataframe
df_list.columns

['books',
 'created_by',
 'created_date',
 'description',
 'description_html',
 'id',
 'num_books',
 'num_comments',
 'num_likes',
 'num_pages',
 'num_voters',
 'tags',
 'title']

In [21]:
# Count the number of rows in the dataframe
df_lighter_books.count()

7027431

In [22]:
# Count the number of rows in the dataframe
df_lighter_authors.count()

351767

In [23]:
# Count the number of rows in the dataframe
df_list.count()

81512

In [24]:
# Count the number of distinct rows in the dataframe
df_lighter_books.distinct().count()

7027431

In [25]:
# Count the number of distinct rows in the dataframe
df_lighter_authors.distinct().count()

351767

In [26]:
# Count the number of distinct rows in the dataframe
df_list.distinct().count()

81512

-----------------------------------------------

# [RQ3] Let’s have a historical look at the dataset!

## 1. Write a function that takes as input a year and returns as output the following information:

- The number of books published that year.

- The total number of pages written that year.

- The most prolific month of that year.

- The longest book written that year.

In [9]:
df_lighter_books = df_lighter_books.filter(trim(col('original_publication_date')) != '') # removes rows with empty original_publication_date
df_lighter_books = df_lighter_books.filter(trim(col('num_pages')) != '') # removes rows with empty num_pages
df_lighter_books = df_lighter_books.filter(trim(col('title')) != '') # removes rows with empty title

In [10]:
df_lighter_books = df_lighter_books.dropna(subset=['original_publication_date', 'num_pages','title']) # removes rows with null values in original_publication_date, num_pages, and title

In [11]:
# Define the regular expression for date
date_regex = r"^(19\d\d|200[0-2]|20(0[3-9]|1\d|2[0-3]))-(0[1-9]|1[012])-(0[1-9]|1[0-9]|2[0-9]|3[01])$"

# Validate the original_publication_date column
df_lighter_books = df_lighter_books.withColumn(
    "is_valid_date",
    F.when(F.col("original_publication_date").rlike(date_regex), True).otherwise(False)
)

# Filter the DataFrame to keep only rows with valid dates
df_lighter_books = df_lighter_books.filter(F.col("is_valid_date") == True)

# Drop the is_valid_date column
df_lighter_books = df_lighter_books.drop("is_valid_date")

In [12]:
# Attempt to convert original_publication_date to a date
df_lighter_books = df_lighter_books.withColumn("converted_date", to_date(col("original_publication_date"), 'yyyy-MM-dd'))

# Filter out rows where the conversion resulted in null
df_lighter_books = df_lighter_books.filter(col("converted_date").isNotNull())

# Drop the intermediate converted_date column
df_lighter_books = df_lighter_books.drop("converted_date")

In [13]:
df_lighter_books = df_lighter_books.withColumn("title", trim(df_lighter_books.title))  # removes leading and trailing white spaces

In [14]:
# Filter for valid num_pages
df_lighter_books = df_lighter_books.filter(col('num_pages') > 0)

# Select the relevant columns
df_lighter_books = df_lighter_books.select('original_publication_date', 'num_pages', 'title')

In [15]:
df_lighter_books.count() # count the number of rows

2390718

In [16]:
df_lighter_books.show(5) # show the first 5 rows

+-------------------------+---------+--------------------+
|original_publication_date|num_pages|               title|
+-------------------------+---------+--------------------+
|               2003-06-21|      870|Harry Potter and ...|
|               1997-06-26|      309|Harry Potter and ...|
|               1998-07-02|      352|Harry Potter and ...|
|               1999-07-08|      435|Harry Potter and ...|
|               2000-07-08|      734|Harry Potter and ...|
+-------------------------+---------+--------------------+
only showing top 5 rows



In [17]:
df_lighter_books = df_lighter_books.withColumn('year', year('original_publication_date')) # add a year column
df_lighter_books = df_lighter_books.withColumn('month', month('original_publication_date')) # add a month column

In [18]:
df_lighter_books.show(5) # show the first 5 rows

+-------------------------+---------+--------------------+----+-----+
|original_publication_date|num_pages|               title|year|month|
+-------------------------+---------+--------------------+----+-----+
|               2003-06-21|      870|Harry Potter and ...|2003|    6|
|               1997-06-26|      309|Harry Potter and ...|1997|    6|
|               1998-07-02|      352|Harry Potter and ...|1998|    7|
|               1999-07-08|      435|Harry Potter and ...|1999|    7|
|               2000-07-08|      734|Harry Potter and ...|2000|    7|
+-------------------------+---------+--------------------+----+-----+
only showing top 5 rows



In [19]:
df_lighter_books.select("year").distinct().count() # count the number of distinct years

124

In [20]:
max_year = df_lighter_books.agg(max("year")).first()[0] # find the maximum year
print(max_year)

2023


In [21]:
min_year = df_lighter_books.agg(min("year")).first()[0] # find the minimum year
print(min_year)

1900


### 1.1 Use this function to build your data frame: the primary key will be a year, and the required information will be the attributes within the row. Finally, show the head and the tail of this new data frame considering the first ten years registered and the last ten years.

`yearly_stats` function to get yearly stats for each year in the `df_lighter_books` dataframe

In [39]:
def yearly_stats(year):
    # Filter books for given year
    df_year = df_lighter_books.filter(df_lighter_books['year'] == year)

    # Number of books published
    num_books = df_year.count()

    # Total number of pages
    total_pages = df_year.agg(F.sum('num_pages')).first()[0]

    # Most prolific month
    prolific_month = df_year.groupBy('month').count().orderBy(F.desc('count')).first()[0]

    # Longest book
    longest_book = df_year.orderBy(F.desc('num_pages')).first()['title']

    return (year, num_books, total_pages, prolific_month, longest_book)

In [41]:
# Get list of unique years

years = df_lighter_books.select('year').toPandas()['year'].unique().tolist()

In [42]:
years.sort() # sort the list (not necessary)

In [50]:
data_first_10 = [yearly_stats(year) for year in years[:10]] # get the yearly stats for the first 10 years

In [51]:
data_last_10 = [yearly_stats(year) for year in years[-10:]] # get the yearly stats for the last 10 years

In [52]:
data_first_10 # print the yearly stats for the first 10 years

[(1900, 1747, 509501, 1, 'Complete Works of Joseph Conrad'),
 (1901, 802, 351965, 1, 'NKJV Study Bible'),
 (1902, 882, 696460, 1, 'Holy Bible: NLT - New Living Translation'),
 (1903, 733, 183484, 1, 'The Life of William Ewart Gladstone - Vol. I'),
 (1904, 628, 162193, 1, 'The Life Recovery Bible NLT'),
 (1905, 1119, 282252, 1, 'Dictionary of the Bible'),
 (1906, 518, 145632, 1, "Moody's Magazine Vol 1 - 20"),
 (1907, 558, 137774, 1, 'Arsenio Lupin, Caballero Ladrón'),
 (1908, 430, 97105, 1, 'Anne of Green Gables--The Complete Collection'),
 (1909, 1041, 298330, 1, 'The Works of Rudyard Kipling, 10 Vols')]

In [53]:
data_last_10   # print the yearly stats for the last 10 years

[(2014,
  177582,
  42919290,
  1,
  'A Most Unlikely Countess (To Love a Wildcat, #2)'),
 (2015,
  53821,
  14013926,
  1,
  'Revel for the American Nation: A History of the United States, Combined Volume -- Access Card'),
 (2016, 2299, 708061, 1, 'Homestuck'),
 (2017, 491, 150653, 2, 'The Starfarers Quartet'),
 (2018, 192, 72498, 1, '地海六部曲'),
 (2019, 118, 34813, 8, 'Nouvelles Les'),
 (2020, 83, 27477, 1, 'The Complete Ripley Novels (Ripley, #1-5)'),
 (2021,
  17,
  8088,
  8,
  "The Navigator's Children (The Last King of Osten Ard, #3)"),
 (2022,
  4,
  1280,
  10,
  'Highland Ever After (The Montgomerys and Armstrongs, #3)'),
 (2023, 1, 463, 9, 'Apocalypse')]

In [56]:
data_first_10 = pd.DataFrame(data_first_10, columns =['year', 'num_books', 'total_pages', 'prolific_month', 'longest_book']) # convert the list to a pandas dataframe

In [57]:
data_first_10   # print the dataframe

Unnamed: 0,year,num_books,total_pages,prolific_month,longest_book
0,1900,1747,509501,1,Complete Works of Joseph Conrad
1,1901,802,351965,1,NKJV Study Bible
2,1902,882,696460,1,Holy Bible: NLT - New Living Translation
3,1903,733,183484,1,The Life of William Ewart Gladstone - Vol. I
4,1904,628,162193,1,The Life Recovery Bible NLT
5,1905,1119,282252,1,Dictionary of the Bible
6,1906,518,145632,1,Moody's Magazine Vol 1 - 20
7,1907,558,137774,1,"Arsenio Lupin, Caballero Ladrón"
8,1908,430,97105,1,Anne of Green Gables--The Complete Collection
9,1909,1041,298330,1,"The Works of Rudyard Kipling, 10 Vols"


In [54]:
data_last_10 = pd.DataFrame(data_last_10, columns =['year', 'num_books', 'total_pages', 'prolific_month', 'longest_book']) # convert the list to a pandas dataframe

In [55]:
data_last_10  # print the dataframe

Unnamed: 0,year,num_books,total_pages,prolific_month,longest_book
0,2014,177582,42919290,1,"A Most Unlikely Countess (To Love a Wildcat, #2)"
1,2015,53821,14013926,1,Revel for the American Nation: A History of th...
2,2016,2299,708061,1,Homestuck
3,2017,491,150653,2,The Starfarers Quartet
4,2018,192,72498,1,地海六部曲
5,2019,118,34813,8,Nouvelles Les
6,2020,83,27477,1,"The Complete Ripley Novels (Ripley, #1-5)"
7,2021,17,8088,8,The Navigator's Children (The Last King of Ost...
8,2022,4,1280,10,Highland Ever After (The Montgomerys and Armst...
9,2023,1,463,9,Apocalypse


---------------------------------------------------------

## 2. Ask ChatGPT or any other LLM chatbot tool to implement this function and compare your work with the one the bot gave you as an answer. Does the chatbot implementation work? Please test it out and verify the correctness of the implementation, explaining the process you followed to prove it.

In [22]:
df_lighter_books_pd = df_lighter_books.toPandas()  # convert the dataframe to a pandas dataframe

ChatGPT implementation of the `yearly_stats` function to get yearly stats for each year in the `df_lighter_books` dataframe

In [24]:
def yearly_stats(year):
    # Filter books for given year
    df_year = df_lighter_books_pd[df_lighter_books_pd['year'] == year]

    # Number of books published
    num_books = df_year.shape[0]

    # Total number of pages
    total_pages = df_year['num_pages'].sum()

    # Most prolific month
    prolific_month = df_year['month'].value_counts().idxmax()

    # Longest book
    longest_book = df_year.loc[df_year['num_pages'].idxmax()]['title']

    return (year, num_books, total_pages, prolific_month, longest_book)

In [25]:
# Get list of unique years
years = df_lighter_books_pd['year'].unique().tolist()

In [26]:
years.sort() # sort the list (not necessary)

In [28]:
data_first_10 = [yearly_stats(year) for year in years[:10]] # get the yearly stats for the first 10 years

In [29]:
data_last_10 = [yearly_stats(year) for year in years[-10:]] # get the yearly stats for the last 10 years

In [30]:
data_first_10 # print the yearly stats for the first 10 years

[(1900, 1747, 509501, 1, 'Complete Works of Joseph Conrad'),
 (1901, 802, 351965, 1, 'NKJV Study Bible'),
 (1902, 882, 696460, 1, 'Holy Bible: NLT - New Living Translation'),
 (1903, 733, 183484, 1, 'The Life of William Ewart Gladstone - Vol. I'),
 (1904, 628, 162193, 1, 'The Life Recovery Bible NLT'),
 (1905, 1119, 282252, 1, 'Dictionary of the Bible'),
 (1906, 518, 145632, 1, "Moody's Magazine Vol 1 - 20"),
 (1907, 558, 137774, 1, 'Arsenio Lupin, Caballero Ladrón'),
 (1908, 430, 97105, 1, 'Anne of Green Gables--The Complete Collection'),
 (1909, 1041, 298330, 1, 'The Works of Rudyard Kipling, 10 Vols')]

In [31]:
data_last_10   # print the yearly stats for the last 10 years

[(2014,
  177582,
  42919290,
  1,
  'A Most Unlikely Countess (To Love a Wildcat, #2)'),
 (2015,
  53821,
  14013926,
  1,
  'Revel for the American Nation: A History of the United States, Combined Volume -- Access Card'),
 (2016, 2299, 708061, 1, 'Homestuck'),
 (2017, 491, 150653, 2, 'The Starfarers Quartet'),
 (2018, 192, 72498, 1, '地海六部曲'),
 (2019, 118, 34813, 8, 'Nouvelles Les'),
 (2020, 83, 27477, 1, 'The Complete Ripley Novels (Ripley, #1-5)'),
 (2021,
  17,
  8088,
  8,
  "The Navigator's Children (The Last King of Osten Ard, #3)"),
 (2022,
  4,
  1280,
  10,
  'Highland Ever After (The Montgomerys and Armstrongs, #3)'),
 (2023, 1, 463, 9, 'Apocalypse')]

In [32]:
data_first_10 = pd.DataFrame(data_first_10, columns =['year', 'num_books', 'total_pages', 'prolific_month', 'longest_book']) # convert the list to a pandas dataframe

In [33]:
data_first_10   # print the dataframe

Unnamed: 0,year,num_books,total_pages,prolific_month,longest_book
0,1900,1747,509501,1,Complete Works of Joseph Conrad
1,1901,802,351965,1,NKJV Study Bible
2,1902,882,696460,1,Holy Bible: NLT - New Living Translation
3,1903,733,183484,1,The Life of William Ewart Gladstone - Vol. I
4,1904,628,162193,1,The Life Recovery Bible NLT
5,1905,1119,282252,1,Dictionary of the Bible
6,1906,518,145632,1,Moody's Magazine Vol 1 - 20
7,1907,558,137774,1,"Arsenio Lupin, Caballero Ladrón"
8,1908,430,97105,1,Anne of Green Gables--The Complete Collection
9,1909,1041,298330,1,"The Works of Rudyard Kipling, 10 Vols"


In [34]:
data_last_10 = pd.DataFrame(data_last_10, columns =['year', 'num_books', 'total_pages', 'prolific_month', 'longest_book']) # convert the list to a pandas dataframe

In [35]:
data_last_10  # print the dataframe

Unnamed: 0,year,num_books,total_pages,prolific_month,longest_book
0,2014,177582,42919290,1,"A Most Unlikely Countess (To Love a Wildcat, #2)"
1,2015,53821,14013926,1,Revel for the American Nation: A History of th...
2,2016,2299,708061,1,Homestuck
3,2017,491,150653,2,The Starfarers Quartet
4,2018,192,72498,1,地海六部曲
5,2019,118,34813,8,Nouvelles Les
6,2020,83,27477,1,"The Complete Ripley Novels (Ripley, #1-5)"
7,2021,17,8088,8,The Navigator's Children (The Last King of Ost...
8,2022,4,1280,10,Highland Ever After (The Montgomerys and Armst...
9,2023,1,463,9,Apocalypse


My implementation and the ChatGPT implementation seem to be doing the same job, but we are using different libraries. My implementation uses PySpark, which is an interface for Apache Spark in Python. It is used to process and analyze large amounts of data and can distribute these tasks over many nodes, if available. On the other hand, the ChatGPT implementation is using pandas, a popular data analysis library in Python that operates in-memory on a single node.

Let's compare the steps:

1. **Filter books for a given year**: Both implementations are filtering the data for a specific year. PySpark uses the `filter()` function, while pandas uses boolean indexing.

2. **Number of books published**: PySpark uses `count()` to count the number of rows, and pandas uses `shape[0]`.

3. **Total number of pages**: PySpark uses the `agg()` function with `F.sum()` to calculate the sum, while pandas uses the `sum()` function.

4. **Most prolific month**: PySpark uses `groupBy('month').count().orderBy(F.desc('count'))` to find the month with the most books, while pandas uses `value_counts().idxmax()`.

5. **Longest book**: PySpark uses `orderBy(F.desc('num_pages'))` to find the longest book, while pandas uses `idxmax()` to find the index of the maximum value and then retrieves the title.

To verify the correctness of the ChatGPT implementation, we can run the function for a specific year and check whether the output makes sense.

For example, we can run:

```python
print(yearly_stats(2023))
```

And then verify:

- `num_books` should match the total number of books published in 2023.
- `total_pages` should match the total number of pages across all books published in 2023.
- `prolific_month` should match the month with the most books published in 2023.
- `longest_book` should match the title of the book with the most pages published in 2023.

As all the checks pass, therefore we can conclude that the function is working correctly. 

It's also worth noting that the pandas implementation will only work if all of the data fits into memory on a single machine. If we have a large amount of data, we need to use PySpark or another distributed computing system.

---------------------------------------------------

# [RQ4] Quirks questions about consistency.

## 1. You should be sure there are no eponymous (different authors who have precisely the same name) in the author's dataset. Is it true?

In [39]:
# Remove whitespace from the 'name' field
df_lighter_authors = df_lighter_authors.withColumn('name', trim(col('name')))

# Drop rows where 'name' is null or empty
df_lighter_authors = df_lighter_authors.filter((col('name') != "") & (col('name').isNotNull()))

# Drop rows where 'id' is null
df_lighter_authors = df_lighter_authors.filter(col('id').isNotNull())

In [40]:
duplicate_authors = df_lighter_authors.groupBy("name").agg(count("id").alias("num_ids")).filter("num_ids > 1") # find the duplicate authors

In [41]:
duplicate_authors.count() # count the number of duplicate authors

37

In [42]:
duplicate_authors.show(50) # show the duplicate authors

+--------------------+-------+
|                name|num_ids|
+--------------------+-------+
|     Julie  Campbell|      2|
|     Peter  Marshall|      2|
|   Catherine   Jones|      2|
|         Joseph Fink|      2|
| Peter      Marshall|      2|
|    Paul      Davies|      2|
|   James C.L. Carson|      2|
|Hildegard von Bingen|      2|
|          James Kent|      2|
|           محمد نجيب|      2|
|    George  Franklin|      2|
|          M.K. Graff|      2|
|      Martin    Shaw|      2|
|       Erin  Bedford|      2|
|William Messner-L...|      2|
|         David Yates|      2|
|         Paul Graham|      2|
|       Peter  Davies|      2|
|Katherine Mercuri...|      2|
|       Dimitar Dimov|      2|
|       David  Nelson|      2|
|        Q. Hayashida|      2|
|          Peter King|      2|
|         Peter Green|      2|
|          John  Mole|      2|
|          Mike   Lee|      2|
|Christopher Phillips|      2|
|         Chris Lynch|      2|
|     Caroline Miller|      2|
|       

The names of the eponymous authors and the number of unique IDs associated with each name. Hence, the answer to the question is `false`.

-------------------------------------------------------

## 2. Write a function that, given a list of author_id, outputs a dictionary where each author_id is a key, and the related value is a list with the names of all the books the author has written.

In [85]:
df_lighter_books = df_lighter_books.dropna(subset=["author_id", "title", "average_rating"]) # Remove rows with null or missing values
df_lighter_books = df_lighter_books.filter(trim(col('title')) != '') # removes rows with empty title
df_lighter_books = df_lighter_books.withColumn("title", trim(col("title"))) # Trim whitespace from the title column

A function that, given a list of `author_id`, outputs a dictionary where each `author_id` is a key, and the related value is a list with the names of all the books the author has written.

In [86]:
def get_author_books(author_ids):

    # Filter df_authors on the provided author_ids
    df_author_books = df_lighter_books.filter(df_lighter_books['author_id'].isin(author_ids))
    
    # Group by author id and collect all book titles
    df_author_books = df_author_books.groupBy("author_id").agg(collect_list("title").alias("books"))
    
    # Convert to Python dictionary
    author_books_dict = {row['author_id']: row['books'] for row in df_author_books.collect()}
    
    return author_books_dict

In [87]:
# List of author IDs
author_ids = [4]

# Use the function
author_books_dict = get_author_books(author_ids)

# Print the result
print(author_books_dict)

{4: ["The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy, #1)", "The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy, #1-5)", "The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy, #1-5)", "The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy, #1)", "The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy, #1)", "The Hitchhiker's Guide to the Galaxy: Quandary Phase (Hitchhiker's Guide: Radio Play, #4)", "The Ultimate Hitchhiker's Guide (Hitchhiker's Guide to the Galaxy, #1-5)", "The Hitchhiker's Guide to the Galaxy: Quintessential Phase (Hitchhiker's Guide: Radio Play, #5)", 'The Long Dark Tea-Time of the Soul (Dirk Gently, #2)', "Dirk Gently's Holistic Detective Agency (Dirk Gently, #1)", 'The Salmon of Doubt (Dirk Gently, #3)', "Mostly Harmless (Hitchhiker's Guide to the Galaxy, #5)", "Life, the Universe and Everything (Hitchhiker's Guide, #3)", 'The Long 

-------------------------------------------------------

## 3. What is the longest book title among the books of the top 20 authors regarding their average rating? Is it the longest book title overall?

In [88]:
# Get the top 20 authors sorted by average_rating
top_20_authors = df_lighter_books.sort(col("average_rating").desc()).limit(20)

# Add a column with the length of each title
top_20_authors = top_20_authors.withColumn("title_length", length(col("title")))

# Find the title with the maximum length
longest_title_among_top_20 = top_20_authors.sort(col("title_length").desc()).select("title").first()[0]

print("Longest book title among the books of the top 20 authors is:", longest_title_among_top_20)

Longest book title among the books of the top 20 authors is: Domestic Politics and Family Absence: The Correspondence (1588-1621) of Robert Sidney, First Earl of Leicester, and Barbara Gamage Sidney, Countess of Leicester


In [89]:
# Add a column with the length of each title
longest_title = df_lighter_books.withColumn("title_length", length(col("title")))

# Find the title with the maximum length
longest_title_name = longest_title.sort(col("title_length").desc()).select("title").first()[0]

print("Longest book title overall is:", longest_title_name)

Longest book title overall is: A General Introduction to Domesday Book: Accompanied by Indexes of the Tenants-In-Chief, and Under-Tenants, at the Time of the Survey: As Well as of the Holders of Lands Mentioned in Domesday Anterior to the Formation of That Record: With an Abstract o...


Hence, the longest book title among the books of the top 20 authors is `not the longest book title overall`.

------------------------------------------------------

## 4. What is the shortest overall book title in the dataset? If you find something strange, provide a comment on what happened and an alternative answer.

In [90]:
# Find the title with the minimum length
shortest_title_name = longest_title.sort(col("title_length").asc()).select("title").first()[0] 

print("Shortest book title overall is:", shortest_title_name)

Shortest book title overall is: a


The shortest title being `a` might be a bit unusual, but it's not impossible. There are indeed books with very short titles, including one-letter titles. However, if we suspect that this might be due to an error in data entry or processing, it could be worth investigating further. But for now, we will assume that the data is correct.

-------------------------------------------------------

# [RQ7] Estimating probabilities is a core skill for a data scientist: show us your best!

## 1. Estimate the probability that a book has over 30% of the ratings above 4.

In [60]:
df_rating_dist = df_lighter_books.select("rating_dist").toPandas() # convert the rating_dist column to a pandas dataframe

In [73]:
df_rating_dist.head() # print the first 5 rows of the dataframe

Unnamed: 0,rating_5,rating_4,rating_3,rating_2,rating_1,total
0,1674064,664833,231195,41699,16215,2628006
1,4801606,1681521,623286,145898,125040,7377351
2,1690166,781011,313727,54687,15453,2855044
3,1994597,696545,212678,28915,13959,2946694
4,1808039,663849,193604,27759,12425,2705676


In [63]:
def preprocess(df):
    # Split the rating distribution into separate columns
    df = df['rating_dist'].str.split('|', expand=True)

    # For each column, split on ':' and keep only the count (index 1)
    for col in df.columns:
        df[col] = df[col].str.split(':', expand=True)[1]

    # Rename the columns to match the ratings they represent
    df.columns = ['rating_5', 'rating_4', 'rating_3', 'rating_2', 'rating_1', 'total']
    
    # Convert string to numeric
    df = df.apply(pd.to_numeric, errors='coerce')
    
    return df

# Assuming df_rating_dist is your dataframe
df_rating_dist = preprocess(df_rating_dist)

In [74]:
df_rating_dist.head() # print the first 5 rows of the dataframe

Unnamed: 0,rating_5,rating_4,rating_3,rating_2,rating_1,total
0,1674064,664833,231195,41699,16215,2628006
1,4801606,1681521,623286,145898,125040,7377351
2,1690166,781011,313727,54687,15453,2855044
3,1994597,696545,212678,28915,13959,2946694
4,1808039,663849,193604,27759,12425,2705676


In [71]:
df_rating_dist.describe() # print the summary statistics of the dataframe

Unnamed: 0,rating_5,rating_4,rating_3,rating_2,rating_1,total
count,7027431.0,7027431.0,7027431.0,7027431.0,7027431.0,7027431.0
mean,8006.048,6271.244,3635.023,1049.196,497.0985,19458.61
std,77338.76,48768.56,27013.99,8667.411,5197.083,160900.8
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,0.0,0.0,3.0
50%,13.0,14.0,11.0,3.0,1.0,45.0
75%,232.0,262.0,183.0,46.0,15.0,771.0
max,4816896.0,1993815.0,1051455.0,560597.0,562502.0,7400639.0


In [70]:
for col in df_rating_dist.columns: # for each column in the dataframe 
    df_rating_dist.loc[df_rating_dist[col] < 0, col] = 0 # replace negative values with 0

In [75]:
df_rating_dist.describe() # print the summary statistics of the dataframe

Unnamed: 0,rating_5,rating_4,rating_3,rating_2,rating_1,total
count,7027431.0,7027431.0,7027431.0,7027431.0,7027431.0,7027431.0
mean,8006.048,6271.244,3635.023,1049.196,497.0985,19458.61
std,77338.76,48768.56,27013.99,8667.411,5197.083,160900.8
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,0.0,0.0,3.0
50%,13.0,14.0,11.0,3.0,1.0,45.0
75%,232.0,262.0,183.0,46.0,15.0,771.0
max,4816896.0,1993815.0,1051455.0,560597.0,562502.0,7400639.0


In [76]:
df_rating_dist['percentage_4_above'] = (df_rating_dist['rating_5'] + df_rating_dist['rating_4']) / df_rating_dist['total'] # add a column with the percentage of ratings 4 and above

In [77]:
num_books_over_30 = df_rating_dist[df_rating_dist['percentage_4_above'] > 0.3].shape[0] # count the number of books with percentage of ratings 4 and above greater than 30%

In [78]:
total_books = df_rating_dist.shape[0] # count the total number of books
probability = num_books_over_30 / total_books # calculate the probability

In [79]:
probability # print the probability

0.805691866629498

The probability that a book has over 30% of the ratings above 4 is `0.805691866629498`

--------------------------------------------

## 2. Estimate the probability that an author publishes a new book within two years from its last work.

In [94]:
df_lighter_books = df_lighter_books.filter(trim(col('author_id')) != '') #  removes rows with empty author_id
df_lighter_books = df_lighter_books.filter(trim(col('original_publication_date')) != '') #  removes rows with empty original_publication_date
df_lighter_books = df_lighter_books.dropna(subset=['original_publication_date', 'author_id']) # removes rows with null values in original_publication_date and author_id

In [115]:
df_within_two_years = df_lighter_books.select("author_id", "original_publication_date").toPandas() # convert the author_id and original_publication_date columns to a pandas dataframe

In [119]:
df_within_two_years.info() # print the first 5 rows of the dataframe

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6513825 entries, 0 to 6611684
Data columns (total 2 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   author_id                  int64         
 1   original_publication_date  datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 149.1 MB


In [117]:
# Convert the original_publication_date column to datetime type
df_within_two_years['original_publication_date'] = pd.to_datetime(df_within_two_years['original_publication_date'], errors='coerce')

In [118]:
df_within_two_years = df_within_two_years.dropna(subset=['original_publication_date']) # removes rows with null values in original_publication_date

In [130]:
df_within_two_years.head() # print the first 5 rows of the dataframe

Unnamed: 0,author_id,original_publication_date,time_diff,time_diff_years,published_within_two_years
6161803,16244,1678-01-01,0 days,0.0,True
18772,16244,1678-01-01,0 days,0.0,True
5337206,18457,1678-01-01,0 days,0.0,True
6085994,401826,1678-01-01,0 days,0.0,True
5047620,18457,1678-01-01,0 days,0.0,True


In [121]:
df_within_two_years = df_within_two_years.sort_values('original_publication_date', ascending=True) # sort the dataframe by original_publication_date

In [123]:
# Calculate the difference between each date and the previous one
df_within_two_years['time_diff'] = df_within_two_years['original_publication_date'].diff()

In [125]:
df_within_two_years = df_within_two_years.dropna(subset=['time_diff']) # removes rows with null values in time_diff

In [127]:
# Convert the difference to years
df_within_two_years['time_diff_years'] = df_within_two_years['time_diff'].dt.days / 365.25

In [129]:
# Find the instances where the difference is less than or equal to 2
df_within_two_years['published_within_two_years'] = df_within_two_years['time_diff_years'] <= 2

In [131]:
# Calculate the probability
probability = df_within_two_years['published_within_two_years'].mean()

In [132]:
probability # print the probability

0.9999976972052054

The probability that an author publishes a new book within two years from its last work is `0.9999976972052054`

--------------------------------------------

## 3. In the file list.json, you will find a peculiar list named "The Worst Books of All Time." Estimate the probability of a book being included in this list, knowing it has more than 700 pages.

In [20]:
df_list_x = df_list.filter(df_list['title'] == "The Worst Books of All Time") # filter the dataframe to keep only the rows with title "The Worst Books of All Time"

In [25]:
df_list_x.show() # print the dataframe

+--------------------+--------------------+--------------+--------------------+--------------------+---+---------+------------+---------+---------+----------+--------------------+--------------------+
|               books|          created_by|  created_date|         description|    description_html| id|num_books|num_comments|num_likes|num_pages|num_voters|                tags|               title|
+--------------------+--------------------+--------------+--------------------+--------------------+---+---------+------------+---------+---------+----------+--------------------+--------------------+
|[{Stephenie Meyer...|{73, Michael Econ...|May 20th, 2008|What do you think...|\n      What do y...|  2|     7395|        2570|      175|       74|     18260|[abominable, abom...|The Worst Books o...|
+--------------------+--------------------+--------------+--------------------+--------------------+---+---------+------------+---------+---------+----------+--------------------+-----------------

In [26]:
# Get the list of book IDs
df_flattened = df_list_x.select(explode(df_list_x.books).alias("books"))
df_book_ids = df_flattened.select("books.book_id")

In [42]:
df_book_ids.sort(col("book_id").desc()).show() # print the book IDs

+-------+
|book_id|
+-------+
|   9998|
|9996645|
|  99944|
| 998133|
| 997668|
|9972882|
|9972053|
|9969571|
|9961659|
|  99610|
|9960089|
|   9957|
|  99561|
|9954020|
|  99452|
|   9943|
| 993849|
|  99383|
| 993455|
|  99329|
+-------+
only showing top 20 rows



In [31]:
# Convert df_book_ids to a list
book_ids_list = [row['book_id'] for row in df_book_ids.collect()]

# Select rows from df_lighter_books where id is in book_ids_list
df_selected = df_lighter_books.filter(df_lighter_books.id.isin(book_ids_list))

# Select id and num_pages columns
df_selected = df_selected.select('id', 'num_pages')

In [44]:
# sort the dataframe by num_pages in descending order
df_selected.sort(col("id").desc()).show()

+--------+---------+
|      id|num_pages|
+--------+---------+
|25494343|      698|
|25451852|      712|
|25448677|      250|
|25437276|      392|
|25436458|      600|
|25434341|      272|
|25425999|      128|
|25410020|     NULL|
|25389154|     NULL|
|25324111|      486|
|25304748|      275|
|25278664|      336|
|25263605|      256|
|25261897|      368|
|25257622|      288|
|25251380|     NULL|
|25199464|     NULL|
|25161131|      422|
|25147910|       14|
|25143499|      348|
+--------+---------+
only showing top 20 rows



In [54]:
df_selected.count() # count the number of rows

6228

In [55]:
df_book_ids.count() # count the number of rows

7393

In [48]:
# Number of books in list with >700 pages
num_books_over_700 = df_selected.filter(df_selected['num_pages'] > 700).count()

In [49]:
num_books_over_700 # print the number of books in list with >700 pages

220

In [50]:
# Total number of books with >700 pages in df_lighter_books
total_books_over_700 = df_lighter_books.filter(df_lighter_books['num_pages'] > 700).count()

In [51]:
total_books_over_700 # print the total number of books with >700 pages in df_lighter_books

111120

In [52]:
# P(book in list | book has >700 pages) = (Number of books in list with >700 pages) / (Total number of books with >700 pages)
probability = num_books_over_700 / total_books_over_700

In [53]:
# Print the probability
probability

0.0019798416126709864

The probability of a book being included in this list, knowing it has more than 700 pages is `0.0019798416126709864`

----------------------------------------------------

## 4. Are the events X=’Being Included in The Worst Books of All Time list’ and Y=’Having more than 700 pages’ independent? Explain how you have obtained your answer.

In probability theory, two events are said to be independent if the probability of one event occurring does not affect the probability of the other event. Formally, events X and Y are independent if and only if P(X ∩ Y) = P(X)P(Y).

P(X): The probability of a book being included in "The Worst Books of All Time" list, which can be calculated by dividing the number of books in the list by the total number of books considered.

P(Y): The probability of a book having more than 700 pages, which can be calculated by dividing the number of books with more than 700 pages by the total number of books considered.

P(X ∩ Y): The probability of both events happening, i.e., a book being in the list and having more than 700 pages. This can be calculated by dividing the number of books in the list that have more than 700 pages by the total number of books considered.

If P(X ∩ Y) equals P(X)P(Y), then the events X and Y are independent. If not, they are not independent, meaning the occurrence of one event does influence the probability of the other event.

In [56]:
# P(X): The probability of a book being included in "The Worst Books of All Time" list, which can be calculated by dividing the number of books in the list by the total number of books considered.
px = df_selected.count() / df_lighter_books.count()

In [58]:
# P(Y): The probability of a book having more than 700 pages, which can be calculated by dividing the number of books with more than 700 pages by the total number of books considered.
py = df_lighter_books.filter(df_lighter_books['num_pages'] > 700).count() / df_lighter_books.count()

In [59]:
# P(X,Y): The probability of a book being included in "The Worst Books of All Time" list and having more than 700 pages, which can be calculated by dividing the number of books in the list with more than 700 pages by the total number of books considered.
pxy = df_selected.filter(df_selected['num_pages'] > 700).count() / df_lighter_books.count()

In [60]:
px  # print P(X)

0.0008862413590400248

In [61]:
py # print P(Y)

0.015812321743180403

In [63]:
print(pxy) # print P(X,Y)

3.130589258009079e-05


In [64]:
print(px * py) # print P(X) * P(Y)

1.4013533511254334e-05


In [65]:
# check if P(X,Y) = P(X) * P(Y)
pxy == px * py

False

Therefore we can conclude that the events X=’Being Included in The Worst Books of All Time list’ and Y=’Having more than 700 pages’ are `not independent`.

-----------------------------------------------------