## Data Cleaning with spark
### in this notebook the useless columns will be removed

### **PLEASE NOTE :**  
### Since this script stores the results in hadoop, execute it only once, otherwise an error will be thrown

---

### Import Libraries

In [29]:
# import libraries
import findspark

# Locate the spark installation
findspark.init()

In [30]:
import pandas as pd
import pyspark as ps
from pyspark.sql.functions import col, sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import SparkSession


### Initialize Spark

In [31]:
# Initialize a SparkContext
spark = SparkSession.builder.appName("prior_analysis").getOrCreate()
spark.stop()
sc = ps.SparkContext(appName="prior_analysis")

# Initialize the Session
spark_session = ps.sql.SparkSession(sc)


### Connect and import data from HDFS directly into a Spark DataFrame

In [32]:
# Define schema for better manipulation

data_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("authors", StringType(), True),
    StructField("image", StringType(), True),
    StructField("previewLink", StringType(), True),
    StructField("publisher", StringType(), True),
    StructField("publishedDate", StringType(), True),
    StructField("infoLink", StringType(), True),
    StructField("categories", StringType(), True),
    StructField("ratingsCount", FloatType(), True)
])

ratings_schema = StructType([
    StructField("Id", IntegerType(), True),
    StructField("Title", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", IntegerType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/helpfulness", StringType(), True),
    StructField("review/score", FloatType(), True),
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True)
])

# Load the data

df_data = spark_session.read.option('escape', '"').csv(
    'hdfs://localhost:9900/user/book_reviews/books_data.csv', header=True, schema=data_schema)
df_ratings = spark_session.read.option('escape', '"').csv(
    'hdfs://localhost:9900/user/book_reviews/books_rating.csv', header=True, schema=ratings_schema)

### Remove useless columns

In [33]:
df_ratings=df_ratings.drop(df_ratings.Id)
df_ratings.show(5)
df_ratings.write.csv('hdfs://localhost:9900/user/book_reviews/books_rating_clean.csv',mode='overwrite', header=True)

+--------------------+-----+-------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|               Title|Price|User_id|         profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|
+--------------------+-----+-------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|Its Only Art If I...| null|   null|Jim of Oz "jim-of...|               7/7|         4.0|  940636800|Nice collection o...|This is only for ...|
|Dr. Seuss: Americ...| null|   null|       Kevin Killian|             10/10|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|
|Dr. Seuss: Americ...| null|   null|        John Granger|             10/11|         5.0| 1078790400|Essential for eve...|If people become ...|
|Dr. Seuss: Americ...| null|   null|Roy E. Perry "ama...|               7/7|         4.0| 1090713600|Phlip Nel gives s...|Theodore Seuss

                                                                                

### Check whether the columns has been correctly removed

In [34]:
ratings_df = spark_session.read.option('escape','"').csv('hdfs://localhost:9900/user/book_reviews/books_rating_clean.csv', header=True, inferSchema=True)
ratings_df.printSchema()
ratings_df.describe().show()
ratings_df.show(5)

                                                                                

root
 |-- Title: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- User_id: string (nullable = true)
 |-- profileName: string (nullable = true)
 |-- review/helpfulness: string (nullable = true)
 |-- review/score: string (nullable = true)
 |-- review/time: string (nullable = true)
 |-- review/summary: string (nullable = true)
 |-- review/text: string (nullable = true)





+-------+--------------------+--------------------+--------------------+-----------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|summary|               Title|               Price|             User_id|      profileName| review/helpfulness|        review/score|         review/time|      review/summary|         review/text|
+-------+--------------------+--------------------+--------------------+-----------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|  count|             2999792|              482638|                1122|          2436873|            2999074|             2999696|             2999736|             2999722|             2999721|
|   mean|   2012.796651763537|   21.76817700153379|   15.84792915531335|              NaN|4.690351793452941E8|  1657.0094941017894|1.1270489306820295E9|            Infinity|  9.95368319174848E8|
| stddev|    1536.7533549

                                                                                

In [35]:
spark_session.stop()