In [1]:
import findspark

In [2]:
# Point findspark at the Spark installation directory (the SPARK_HOME path)
# so that the pyspark imports in the following cells can be resolved.
findspark.init(spark_home="/opt/manual/spark")

In [3]:
from pyspark.sql import SparkSession, functions as F

In [4]:
# Create the session for this notebook, or reuse one that already exists.
# master "local[2]" runs Spark locally with two worker threads.
spark = (
    SparkSession.builder
    .appName("Review Execution Plan")
    .master("local[2]")
    .getOrCreate()
)

2022-08-27 20:05:06,036 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Data source: https://www.kaggle.com/jiashenliu/515k-hotel-reviews-data-in-europe

In [6]:
#! wget -P ~/datasets \
#https://github.com/erkansirin78/datasets/raw/master/Hotel_Reviews.csv.gz

In [7]:
# Shell escape: list ~/datasets and keep only the entries whose line contains "Hotel",
# to confirm the review archive is present before it is read.
! ls -l ~/datasets | grep Hotel

-rw-rw-r--. 1 train train 46401315 Aug 27 11:25 Hotel_Reviews.csv.gz


In [8]:
# Import the needed types by name rather than with a wildcard, so it is clear
# where every name used in this notebook comes from.
# ArrayType is not used in the schema itself; it is needed when Tags is
# converted to an array after the read.
from pyspark.sql.types import (
    ArrayType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema for the hotel reviews CSV. Every column is declared nullable.
#
# Two columns are intentionally left as strings at this stage:
#   * Tags        - should really be an array, but CSV cannot store an array
#                   type, so it has to be read as StringType.
#   * Review_Date - should be DateType; it is left as StringType for the moment
#                   and converted once the schema has been applied to the data.
programmatical_schema = StructType([
    StructField("Hotel_Address", StringType(), True),
    StructField("Additional_Number_of_Scoring", IntegerType(), True),
    StructField("Review_Date", StringType(), True),
    StructField("Average_Score", FloatType(), True),
    StructField("Hotel_Name", StringType(), True),
    StructField("Reviewer_Nationality", StringType(), True),
    StructField("Negative_Review", StringType(), True),
    StructField("Review_Total_Negative_Word_Counts", IntegerType(), True),
    StructField("Total_Number_of_Reviews", IntegerType(), True),
    StructField("Positive_Review", StringType(), True),
    StructField("Review_Total_Positive_Word_Counts", IntegerType(), True),
    StructField("Total_Number_of_Reviews_Reviewer_Has_Given", IntegerType(), True),
    StructField("Reviewer_Score", FloatType(), True),
    StructField("Tags", StringType(), True),
    StructField("days_since_review", StringType(), True),
    StructField("lat", FloatType(), True),
    StructField("lng", FloatType(), True),
])

In [9]:
# Read the gzip-compressed CSV from the local filesystem. The first row is a
# header, and the explicit schema defined earlier is applied to the columns.
df = (
    spark.read
    .option("header", True)
    .schema(programmatical_schema)
    .option("compression", "gzip")
    .csv("file:///home/train/datasets/Hotel_Reviews.csv.gz")
)

In [10]:
# With the data loaded, correct the two columns that were read as strings.

# Tags: the string must first be split on commas; the result is then cast to
# an array of strings.
tags_as_array = F.split(F.col("Tags"), ",").cast(ArrayType(StringType()))

# Review_Date: parse the text using the M/d/yyyy pattern to obtain a date.
review_date_as_date = F.to_date(F.col("Review_Date"), "M/d/yyyy")

df2 = (
    df
    .withColumn("Tags", tags_as_array)
    .withColumn("Review_Date", review_date_as_date)
)

In [11]:
# What if we want day names instead of day numbers?
# Format Review_Date with the 'E' pattern to get the day name, then count the
# reviews and average the reviewer score per day, highest average first.
(
    df2
    .select(
        'Review_Date',
        F.date_format('Review_Date', 'E').alias('Day_of_Review'),
        'Reviewer_Score',
    )
    .groupBy('Day_of_Review')
    .agg(
        F.count('*').alias("Total_Reviews"),
        F.avg('Reviewer_Score').alias("AVG_Reviewer_Score"),
    )
    .orderBy(F.desc('AVG_Reviewer_Score'))
    .show()
)

                                                                                

+-------------+-------------+------------------+
|Day_of_Review|Total_Reviews|AVG_Reviewer_Score|
+-------------+-------------+------------------+
|          Tue|       120948| 8.444183518088053|
|          Wed|        58591| 8.405661343067354|
|          Mon|        81145| 8.392337241050269|
|          Sun|        83981| 8.390862285826119|
|          Fri|        44732| 8.373535788875781|
|          Sat|        51833| 8.371462261789713|
|          Thu|        74508| 8.344132239579093|
+-------------+-------------+------------------+



<img src="../images/spark_computaion_journey.png"/>

<p>Source: Learning Spark, O'Reilly, 2020</p>

<img src="../images/spark_example_query_plan.png"/>

<p>Source: Learning Spark, O'Reilly, 2020 </p>

In [12]:
# Build the per-day aggregation as a DataFrame without calling an action on it,
# so that its execution plan can be inspected.
df3 = (
    df2
    .select(
        'Review_Date',
        F.date_format('Review_Date', 'E').alias('Day_of_Review'),
        'Reviewer_Score',
    )
    .groupBy('Day_of_Review')
    .agg(
        F.count('*').alias("Total_Reviews"),
        F.avg('Reviewer_Score').alias("AVG_Reviewer_Score"),
    )
    .orderBy(F.desc('AVG_Reviewer_Score'))
)

In [13]:
# extended=True prints the logical plans in addition to the physical plan.
df3.explain(extended=True)

== Parsed Logical Plan ==
'Sort ['AVG_Reviewer_Score DESC NULLS LAST], true
+- Aggregate [Day_of_Review#105], [Day_of_Review#105, count(1) AS Total_Reviews#113L, avg(cast(Reviewer_Score#12 as double)) AS AVG_Reviewer_Score#115]
   +- Project [Review_Date#52, date_format(cast(Review_Date#52 as timestamp), E, Some(Europe/Istanbul)) AS Day_of_Review#105, Reviewer_Score#12]
      +- Project [Hotel_Address#0, Additional_Number_of_Scoring#1, to_date('Review_Date, Some(M/d/yyyy)) AS Review_Date#52, Average_Score#3, Hotel_Name#4, Reviewer_Nationality#5, Negative_Review#6, Review_Total_Negative_Word_Counts#7, Total_Number_of_Reviews#8, Positive_Review#9, Review_Total_Positive_Word_Counts#10, Total_Number_of_Reviews_Reviewer_Has_Given#11, Reviewer_Score#12, Tags#34, days_since_review#14, lat#15, lng#16]
         +- Project [Hotel_Address#0, Additional_Number_of_Scoring#1, Review_Date#2, Average_Score#3, Hotel_Name#4, Reviewer_Nationality#5, Negative_Review#6, Review_Total_Negative_Word_Counts#7,

In [16]:
# Stop the SparkSession now that the notebook is finished with it.
spark.stop()