## Data Cleaning with spark
### in this notebook the useless columns will be removed

### **PLEASE NOTE :**  
### Since this script stores the results in hadoop, execute it only once, otherwise an error will be thrown

---

### Import Libraries

In [1]:
# import libraries
import findspark

# Locate the spark installation
findspark.init()

In [2]:
import pandas as pd
import pyspark as ps
from pyspark.sql.functions import col, sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import SparkSession


### Initialize Spark

In [3]:
# Initialize a SparkContext
spark = SparkSession.builder.appName("prior_analysis").getOrCreate()
spark.stop()
sc = ps.SparkContext(appName="prior_analysis")

# Initialize the Session
spark_session = ps.sql.SparkSession(sc)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/04 14:10:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/09/04 14:10:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/09/04 14:11:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Connect and import data from HDFS directly into a Spark DataFrame

In [4]:
# Define schema for better manipulation

data_schema = StructType([
    StructField("Title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("authors", StringType(), True),
    StructField("image", StringType(), True),
    StructField("previewLink", StringType(), True),
    StructField("publisher", StringType(), True),
    StructField("publishedDate", StringType(), True),
    StructField("infoLink", StringType(), True),
    StructField("categories", StringType(), True),
    StructField("ratingsCount", FloatType(), True)
])

ratings_schema = StructType([
    StructField("Id", IntegerType(), True),
    StructField("Title", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("User_id", IntegerType(), True),
    StructField("profileName", StringType(), True),
    StructField("review/helpfulness", StringType(), True),
    StructField("review/score", FloatType(), True),
    StructField("review/time", IntegerType(), True),
    StructField("review/summary", StringType(), True),
    StructField("review/text", StringType(), True)
])

Expected: Title but found: Price

# Load the data

df_data = spark_session.read.option('escape', '"').csv(
    'hdfs://localhost:9900/user/book_reviews/books_data.csv', header=True, schema=data_schema)
df_ratings = spark_session.read.option('escape', '"').csv(
    'hdfs://localhost:9900/user/book_reviews/books_rating.csv', header=True, schema=ratings_schema)

SyntaxError: invalid syntax (3651720702.py, line 29)

### Remove useless columns

In [None]:
df_ratings.drop(df_ratings.Id)


DataFrame[Title: string, Price: float, User_id: int, profileName: string, review/helpfulness: string, review/score: float, review/time: int, review/summary: string, review/text: string]

### Check whether the columns has been correctly removed

In [None]:
ratings_df = spark_session.read.option('escape','"').csv('hdfs://localhost:9900/user/book_reviews/books_rating.csv', header=True, inferSchema=True)
ratings_df.printSchema()
ratings_df.describe().show()
ratings_df.show(5)

                                                                                

root
 |-- Id: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- User_id: string (nullable = true)
 |-- profileName: string (nullable = true)
 |-- review/helpfulness: string (nullable = true)
 |-- review/score: double (nullable = true)
 |-- review/time: integer (nullable = true)
 |-- review/summary: string (nullable = true)
 |-- review/text: string (nullable = true)



23/09/04 14:09:50 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+--------------------+--------------------+------------------+--------------------+-----------+------------------+-----------------+-------------------+--------------------+--------------------+
|summary|                  Id|               Title|             Price|             User_id|profileName|review/helpfulness|     review/score|        review/time|      review/summary|         review/text|
+-------+--------------------+--------------------+------------------+--------------------+-----------+------------------+-----------------+-------------------+--------------------+--------------------+
|  count|             3000000|             2999792|            481171|             2438213|    2438114|           3000000|          3000000|            3000000|             2999962|             2999992|
|   mean|1.0568515696607149E9|   2012.796651763537|21.762655874939096|                null|        NaN|              null|4.215289333333334|1.132306772630393E9|            Infinity|       

                                                                                

In [None]:
spark_session.stop()

NameError: name 'spark_session' is not defined