In [22]:
# Import Dependencies
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql.functions import col, explode, when, sum, size

# Create (or reuse) the Spark session that backs this notebook
session_builder = SparkSession.builder.appName("BookRatingDataCleaning")
spark = session_builder.getOrCreate()

In [23]:
# Load the books CSV through Spark, treating the first row as column names.
# No schema is supplied and inference is not enabled, so columns are read as strings.
books_csv_path = '../../Resources/panda_df/books_cleaned_final.csv'
df = spark.read.option('header', True).csv(books_csv_path)
df.show()

+-------------+--------------+--------+---------+---------+----------------+-------------+------------------+---------------------+
|       isbn13|average_rating| book_id|   format|num_pages|publication_year|ratings_count|text_reviews_count|author_average_rating|
+-------------+--------------+--------+---------+---------+----------------+-------------+------------------+---------------------+
|0000000067317|          4.36| 2745937| Hardback|      298|            1994|           27|                 2|                 4.14|
|0000195118146|          3.75|  679038|Paperback|      448|            1997|           44|                 4|                3.865|
|0000195397916|          3.32| 7272274|Hardcover|      208|            2010|           86|                15|                 3.37|
|0000262541785|          4.06|  593335|Paperback|      233|            2004|          138|                11|                 4.06|
|0000340826681|          4.09|  332824|Paperback|      288|            2004|

In [24]:
# Inspect the column names and the types Spark assigned when reading the CSV
df.printSchema()

root
 |-- isbn13: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- format: string (nullable = true)
 |-- num_pages: string (nullable = true)
 |-- publication_year: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- text_reviews_count: string (nullable = true)
 |-- author_average_rating: string (nullable = true)



In [25]:
# Number of rows loaded into the Spark DataFrame
df.count()

838200

In [26]:
# Convert the Spark DataFrame to a pandas DataFrame for the cleaning steps below.
# toPandas() collects every row onto the driver, so the data must fit in local memory.
books_df = df.toPandas()
# Preview the first rows of the pandas frame
books_df.head()

Unnamed: 0,isbn13,average_rating,book_id,format,num_pages,publication_year,ratings_count,text_reviews_count,author_average_rating
0,67317,4.36,2745937,Hardback,298,1994,27,2,4.14
1,195118146,3.75,679038,Paperback,448,1997,44,4,3.865
2,195397916,3.32,7272274,Hardcover,208,2010,86,15,3.37
3,262541785,4.06,593335,Paperback,233,2004,138,11,4.06
4,340826681,4.09,332824,Paperback,288,2004,32,2,4.1


In [27]:
# Summarize the pandas frame: per-column dtype, non-null count and memory usage
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 838200 entries, 0 to 838199
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   isbn13                 838200 non-null  object
 1   average_rating         838200 non-null  object
 2   book_id                838200 non-null  object
 3   format                 838200 non-null  object
 4   num_pages              838200 non-null  object
 5   publication_year       838200 non-null  object
 6   ratings_count          838200 non-null  object
 7   text_reviews_count     838200 non-null  object
 8   author_average_rating  838200 non-null  object
dtypes: object(9)
memory usage: 57.6+ MB


In [28]:
# Check for null values
# Total number of missing cells across the whole frame (0 means no nulls).
# Note: going through value_counts() first would tally distinct rows (dropping
# rows that contain NaN) and then test those tallies for null, which is always
# 0 regardless of the data.
books_df.isna().sum().sum()

0

In [29]:
# Cast the string columns to proper numeric dtypes
int_columns = ['num_pages', 'publication_year', 'ratings_count', 'text_reviews_count']
float_columns = ['average_rating', 'author_average_rating']
numeric_columns = int_columns + float_columns

# Parse each column as a number; values that cannot be parsed become NaN
for column in numeric_columns:
    books_df[column] = pd.to_numeric(books_df[column], errors='coerce')

# Discard rows where any numeric column failed to parse, then narrow the
# count-like columns to 32-bit integers
books_df = books_df.dropna(subset=numeric_columns)
books_df = books_df.astype({column: 'int32' for column in int_columns})

# The rating columns are stored as 32-bit floats in the final cleaned frame
clean_df = books_df.astype({column: 'float32' for column in float_columns})

clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 838200 entries, 0 to 838199
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   isbn13                 838200 non-null  object 
 1   average_rating         838200 non-null  float32
 2   book_id                838200 non-null  object 
 3   format                 838200 non-null  object 
 4   num_pages              838200 non-null  int32  
 5   publication_year       838200 non-null  int32  
 6   ratings_count          838200 non-null  int32  
 7   text_reviews_count     838200 non-null  int32  
 8   author_average_rating  838200 non-null  float32
dtypes: float32(2), int32(4), object(3)
memory usage: 38.4+ MB


In [30]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
clean_df.describe()

Unnamed: 0,average_rating,num_pages,publication_year,ratings_count,text_reviews_count,author_average_rating
count,838200.0,838200.0,838200.0,838200.0,838200.0,838200.0
mean,3.861705,277.535141,2007.252286,789.2949,51.917733,3.890093
std,0.361214,162.98544,8.980084,17303.99,571.329012,0.275013
min,1.0,0.0,1901.0,10.0,0.0,1.0
25%,3.65,176.0,2004.0,22.0,4.0,3.74
50%,3.88,264.0,2010.0,51.0,8.0,3.9
75%,4.11,356.0,2013.0,158.0,22.0,4.06
max,5.0,1500.0,2021.0,4899965.0,142645.0,5.0


In [31]:
# Frequency of each distinct format label, most common first
clean_df['format'].value_counts()

Paperback                         516886
Hardcover                         212023
ebook                              57702
Mass Market Paperback              25353
Audio CD                            8573
                                   ...  
Poetry Chapbook                        1
Hardcover, 11&quot; x 11&quot;         1
magazine                               1
Paperback &amp; Hard cover             1
Leather-finish bound                   1
Name: format, Length: 640, dtype: int64

In [32]:
# Collapse every format other than the three most common into 'other'
popular_formats = ['Paperback', 'Hardcover', 'ebook']
is_unpopular = ~clean_df['format'].isin(popular_formats)
clean_df.loc[is_unpopular, 'format'] = 'other'
clean_df['format'].value_counts()

Paperback    516886
Hardcover    212023
ebook         57702
other         51589
Name: format, dtype: int64

In [33]:
# Write the cleaned frame to CSV, leaving out the pandas index column
cleaned_csv_path = '../../Resources/panda_df/books_cleaned_sampleFinal1.csv'
clean_df.to_csv(cleaned_csv_path, index=False)