## Data Cleaning with spark
### in this notebook the useless columns will be removed

### **PLEASE NOTE :**  
### Since this script stores the results in hadoop, execute it only once, otherwise an error will be thrown

---

### Import Libraries

In [60]:
# import libraries
import pandas as pd
import pyspark as ps
from pyspark.sql.functions import col, sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import findspark

### Initialize Spark

In [61]:
# Locate the spark installation
findspark.init()

# Initialize a SparkContext
spark = SparkSession.builder.appName("data_cleaning").getOrCreate()
spark.stop()
sc = ps.SparkContext(appName="data_cleaning")

# Initialize the Session
spark_session = ps.sql.SparkSession(sc)

### Connect and import data from HDFS directly into a Spark DataFrame

In [62]:
# Define schema for better manipulation

data_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("authors", StringType(), True),
    StructField("image", StringType(), True),
    StructField("previewLink", StringType(), True),
    StructField("publisher", StringType(), True),
    StructField("publishedDate", StringType(), True),
    StructField("infoLink", StringType(), True),
    StructField("categories", StringType(), True),
    StructField("ratingsCount", FloatType(), True)
])

ratings_schema = StructType([
    StructField("Id", IntegerType(), True),
    StructField("Title", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", IntegerType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/helpfulness", StringType(), True),
    StructField("review/score", FloatType(), True),
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True)
])

# Load the data

df_data = spark_session.read.option('escape', '"').csv(
    'hdfs://localhost:9900/user/book_reviews/books_data.csv', header=True, schema=data_schema)
df_ratings = spark_session.read.option('escape', '"').csv(
    'hdfs://localhost:9900/user/book_reviews/books_rating.csv', header=True, schema=ratings_schema)

### Remove useless columns

In [63]:
df_ratings = df_ratings.drop(df_ratings.Id)
df_ratings.show(5)

+--------------------+-----+-------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|               Title|Price|User_id|         profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|
+--------------------+-----+-------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|Its Only Art If I...| null|   null|Jim of Oz "jim-of...|               7/7|         4.0|  940636800|Nice collection o...|This is only for ...|
|Dr. Seuss: Americ...| null|   null|       Kevin Killian|             10/10|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|
|Dr. Seuss: Americ...| null|   null|        John Granger|             10/11|         5.0| 1078790400|Essential for eve...|If people become ...|
|Dr. Seuss: Americ...| null|   null|Roy E. Perry "ama...|               7/7|         4.0| 1090713600|Phlip Nel gives s...|Theodore Seuss

### Add a column that contains the value *R* for the ratings table and *D* for books data table.   
### This column will be used during the mapReduce, to distinguish the two tables 

In [64]:
df_ratings = df_ratings.withColumn("Table", lit("R"))
df_data = df_data.withColumn("Table", lit("D"))
df_ratings.show(5)
df_data.show(5)

+--------------------+-----+-------+--------------------+------------------+------------+-----------+--------------------+--------------------+-----+
|               Title|Price|User_id|         profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|Table|
+--------------------+-----+-------+--------------------+------------------+------------+-----------+--------------------+--------------------+-----+
|Its Only Art If I...| null|   null|Jim of Oz "jim-of...|               7/7|         4.0|  940636800|Nice collection o...|This is only for ...|    R|
|Dr. Seuss: Americ...| null|   null|       Kevin Killian|             10/10|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|    R|
|Dr. Seuss: Americ...| null|   null|        John Granger|             10/11|         5.0| 1078790400|Essential for eve...|If people become ...|    R|
|Dr. Seuss: Americ...| null|   null|Roy E. Perry "ama...|               7/7|         4.0| 1090713600

### Store the results in hadoop

In [65]:
df_ratings.repartition(1).write.option('escape', '"').csv(
    'hdfs://localhost:9900/user/book_reviews/books_rating.csv', mode='overwrite', header=True)

df_data.repartition(1).write.option('escape', '"').csv(
    'hdfs://localhost:9900/user/book_reviews/books_data.csv', mode='overwrite', header=True)

                                                                                

### Check whether the columns has been correctly removed

In [66]:
ratings_df = spark_session.read.option('escape', '"').csv(
    'hdfs://localhost:9900/user/book_reviews/books_rating.csv', header=True, inferSchema=True)
ratings_df.printSchema()
ratings_df.describe().show()
ratings_df.show(5)

                                                                                

root
 |-- Title: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- User_id: string (nullable = true)
 |-- profileName: string (nullable = true)
 |-- review/helpfulness: string (nullable = true)
 |-- review/score: double (nullable = true)
 |-- review/time: integer (nullable = true)
 |-- review/summary: string (nullable = true)
 |-- review/text: string (nullable = true)
 |-- Table: string (nullable = true)





+-------+--------------------+------------------+-------+-----------+------------------+------------------+--------------------+--------------------+--------------------+-------+
|summary|               Title|             Price|User_id|profileName|review/helpfulness|      review/score|         review/time|      review/summary|         review/text|  Table|
+-------+--------------------+------------------+-------+-----------+------------------+------------------+--------------------+--------------------+--------------------+-------+
|  count|             2999792|            481171|      0|    2438097|           3000000|           3000000|             3000000|             2999962|             2999992|3000000|
|   mean|   2012.796651763537|21.762655874938666|   null|        NaN|              null| 4.215289333333334| 1.132306772630393E9|            Infinity|             17963.0|   null|
| stddev|  1536.7533549608793| 26.20654052137011|   null|        NaN|              null|1.203053747233398

                                                                                

In [67]:
data_df = spark_session.read.option('escape', '"').csv(
    'hdfs://localhost:9900/user/book_reviews/books_data.csv', header=True, inferSchema=True)
data_df.printSchema()
data_df.describe().show()
data_df.show(5)


root
 |-- Title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- image: string (nullable = true)
 |-- previewLink: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- publishedDate: string (nullable = true)
 |-- infoLink: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- ratingsCount: double (nullable = true)
 |-- Table: string (nullable = true)





+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+------------------+------+
|summary|               Title|         description|             authors|               image|         previewLink|           publisher|     publishedDate|            infoLink|          categories|      ratingsCount| Table|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+------------------+------+
|  count|              212403|              143962|              180991|              160329|              188568|              136518|            187099|              188568|              171205|             49752|212404|
|   mean|   3823.672941176471|  1.4285714285714286|                null|                null|               

                                                                                

In [68]:
spark_session.stop()