In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql.types import FloatType, IntegerType, DateType

from pyspark.sql import *
from pyspark.sql import functions as sf
from pyspark.sql.functions import col
from pyspark.sql.functions import to_date

In [2]:
sc = pyspark.SparkContext(appName="Preprocessing dataset")
spark = SparkSession(sc)

25/08/30 10:59:03 WARN Utils: Your hostname, dsbda-vm resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/08/30 10:59:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/30 10:59:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/30 10:59:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
hdfs_path="hdfs://localhost:54310/user/ubuntu/dataset/"
schema="BookID string, Title string, Authors string, Avg_Rating string, ISBN string, ISBN13 string, Language_Code string, Num_Pages string, Ratings_Count string, Text_Reviews_Count string, Publication_Date string, Publisher string"

df = spark.read.csv(hdfs_path, header=False, inferSchema=False, sep=",", quote='', escape='', schema=schema)

In [4]:
# Dropping uninformative columns
df=df.drop("BookID", "ISBN", "Language_Code")

In [5]:
# Preprocessing
df = df.select([sf.regexp_replace(c, r'\\|\[|\]|\"', '').alias(c) for c in df.columns])
df = df.select([sf.regexp_replace(c, r'^ \'|\'$|', '').alias(c) for c in df.columns])

In [6]:
# Casting column types
df = df.withColumn("Avg_Rating", col("Avg_Rating").cast(FloatType())).withColumn("Num_Pages", col("Num_Pages").cast(IntegerType())).withColumn("Ratings_Count", col("Ratings_Count").cast(IntegerType())).withColumn("Text_Reviews_Count", col("Text_Reviews_Count").cast(IntegerType()))

df = df.withColumn("Pub_Date",
  sf.coalesce(
    to_date(col("Publication_Date"), "M/d/yyyy"),
    to_date(col("Publication_Date"), "MM/d/yyyy"),
    to_date(col("Publication_Date"), "M/dd/yyyy"),
    to_date(col("Publication_Date"), "MM/dd/yyyy"),
  ).alias("Pub_Date"),
)

df = df.drop("Publication_Date")

In [7]:
df.show()

+--------------------+--------------------+----------+-------------+---------+-------------+------------------+--------------------+----------+
|               Title|             Authors|Avg_Rating|       ISBN13|Num_Pages|Ratings_Count|Text_Reviews_Count|           Publisher|  Pub_Date|
+--------------------+--------------------+----------+-------------+---------+-------------+------------------+--------------------+----------+
|Good Poems for Ha...|Garrison Keillor/...|      4.14|9780143037675|      344|         2662|               214|       Penguin Books|2006-08-29|
|Baseball: a Liter...|Nicholas Dawidoff...|      4.24|9781931082099|      733|          182|                14|  Library of America|2002-03-04|
|Plato: Complete W...|Plato/John M. Coo...|      4.35|9780872203495|     1838|         9284|               133|Hackett Publishin...|1997-05-01|
|The Mammoth Book ...|Mike Ashley/Eric ...|      3.63|9780786714957|      498|           28|                 4|       Running Press|2005

In [8]:
# Saving Preprocessed dataset on hfds
hdfs_path_out="hdfs://localhost:54310/user/ubuntu/dataset_preprocessed/"
df.write.csv(hdfs_path_out, header=False)

AnalysisException: path hdfs://localhost:54310/user/ubuntu/dataset_preprocessed already exists.

In [9]:
spark.stop()