In [1]:
# Import Dependencies
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql.functions import col, explode, when, sum, size

# Create the Spark session for this notebook, or reuse one that is already active
spark = (
    SparkSession.builder
    .appName("BookRatingDataCleaning")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/29 00:09:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/29 00:09:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Load the sampled books CSV into a Spark DataFrame.
# The first line of the file supplies the column names; no schema is given,
# so every column is read as a string (see the printed schema below).
books_sample_path = '../../Resources/panda_df/books_sample_trial2.csv'
df = spark.read.csv(books_sample_path, header=True)
df.show()

+-------------+--------------+--------+--------------------+---------+----------------+-------------+------------------+---------------------+
|       isbn13|average_rating| book_id|              format|num_pages|publication_year|ratings_count|text_reviews_count|author_average_rating|
+-------------+--------------+--------+--------------------+---------+----------------+-------------+------------------+---------------------+
|0000814474233|          3.62|  598454|           Hardcover|      220|            2007|           97|                19|                 3.62|
|0008520919197|          4.07| 3623612|           Paperback|      288|            2006|           21|                 2|                 3.97|
|0008987086307|          4.25| 1776757|           Paperback|      250|            2002|           14|                 1|                 4.35|
|0031809055007|           4.1|  402191|           Paperback|      480|            2000|          402|                26|                 4.13|

In [3]:
# Print each column's name, Spark data type and nullability
df.printSchema()

root
 |-- isbn13: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- format: string (nullable = true)
 |-- num_pages: string (nullable = true)
 |-- publication_year: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- text_reviews_count: string (nullable = true)
 |-- author_average_rating: string (nullable = true)



In [4]:
# Number of rows in the Spark DataFrame
df.count()

83939

In [5]:
# Change the Spark DataFrame to a pandas DataFrame; the cleaning steps that
# follow operate on this pandas copy. Preview the first rows afterwards.
books_df = df.toPandas()
books_df.head()

Unnamed: 0,isbn13,average_rating,book_id,format,num_pages,publication_year,ratings_count,text_reviews_count,author_average_rating
0,814474233,3.62,598454,Hardcover,220,2007,97,19,3.62
1,8520919197,4.07,3623612,Paperback,288,2006,21,2,3.97
2,8987086307,4.25,1776757,Paperback,250,2002,14,1,4.35
3,31809055007,4.1,402191,Paperback,480,2000,402,26,4.13
4,73999304862,3.68,1126975,Paperback,122,1999,22,2,3.82


In [6]:
# Summarise the pandas frame: per-column non-null counts and dtypes
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83939 entries, 0 to 83938
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   isbn13                 83939 non-null  object
 1   average_rating         83939 non-null  object
 2   book_id                83939 non-null  object
 3   format                 83939 non-null  object
 4   num_pages              83939 non-null  object
 5   publication_year       83939 non-null  object
 6   ratings_count          83939 non-null  object
 7   text_reviews_count     83939 non-null  object
 8   author_average_rating  83939 non-null  object
dtypes: object(9)
memory usage: 5.8+ MB


In [7]:
# Check for null values
# Count missing cells directly. DataFrame.value_counts() tallies distinct rows
# (by default skipping rows that contain nulls), and a tally is never null, so
# applying isnull() to those tallies reports 0 regardless of the data.
# The first sum() gives the missing count per column; the second adds them up
# into a single total for the whole frame.
books_df.isnull().sum().sum()

0

In [8]:
# Change data types
# Columns that end up as whole numbers vs. columns that end up as decimals
int_cols = ['num_pages', 'publication_year', 'ratings_count', 'text_reviews_count']
float_cols = ['average_rating', 'author_average_rating']
numeric_cols = int_cols + float_cols

# Parse every numeric column; values that cannot be parsed become NaN
for column_name in numeric_cols:
    books_df[column_name] = pd.to_numeric(books_df[column_name], errors='coerce')

# Discard any row where one of the numeric columns is NaN
books_df = books_df.dropna(subset=numeric_cols)

# Cast the count-like columns to 32-bit integers
books_df = books_df.astype(dict.fromkeys(int_cols, 'int32'))

# Cast the rating columns to 32-bit floats; this result is the cleaned frame
clean_df = books_df.astype(dict.fromkeys(float_cols, 'float32'))

clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83939 entries, 0 to 83938
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   isbn13                 83939 non-null  object 
 1   average_rating         83939 non-null  float32
 2   book_id                83939 non-null  object 
 3   format                 83939 non-null  object 
 4   num_pages              83939 non-null  int32  
 5   publication_year       83939 non-null  int32  
 6   ratings_count          83939 non-null  int32  
 7   text_reviews_count     83939 non-null  int32  
 8   author_average_rating  83939 non-null  float32
dtypes: float32(2), int32(4), object(3)
memory usage: 3.8+ MB


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for clean_df
clean_df.describe()

Unnamed: 0,average_rating,num_pages,publication_year,ratings_count,text_reviews_count,author_average_rating
count,83939.0,83939.0,83939.0,83939.0,83939.0,83939.0
mean,3.86213,278.002359,2007.283813,795.678,52.080177,3.890725
std,0.361524,163.228148,8.939775,15311.51,484.101807,0.274943
min,1.0,0.0,1921.0,10.0,0.0,1.12
25%,3.65,176.0,2004.0,22.0,4.0,3.74
50%,3.89,264.0,2010.0,51.0,8.0,3.905
75%,4.11,356.0,2013.0,158.0,22.0,4.065
max,5.0,1495.0,2020.0,2758812.0,45748.0,5.0


In [10]:
# Count how many rows carry each distinct value of the 'format' column
clean_df['format'].value_counts()

Paperback                51841
Hardcover                21175
ebook                     5794
Mass Market Paperback     2517
Audio CD                   840
                         ...  
Librino                      1
Book                         1
trade paperback              1
Print and Download           1
Box                          1
Name: format, Length: 152, dtype: int64

In [11]:
# Change unpopular formats to 'other': anything that is not one of the three
# most frequent formats is relabelled
popular_formats = ['Paperback', 'Hardcover', 'ebook']
is_unpopular = ~clean_df['format'].isin(popular_formats)
clean_df.loc[is_unpopular, 'format'] = 'other'
clean_df['format'].value_counts()

Paperback    51841
Hardcover    21175
ebook         5794
other         5129
Name: format, dtype: int64

In [12]:
# Write the cleaned frame to CSV, leaving out the pandas index column
cleaned_csv_path = '../../Resources/panda_df/books_cleaned_Trial2.csv'
clean_df.to_csv(cleaned_csv_path, index=False)