In [1]:
import findspark

In [2]:
# Point findspark at the local Spark installation so that pyspark becomes
# importable in the cells below.
findspark.init(spark_home="/opt/manual/spark")

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [4]:
# Build (or reuse) the notebook's SparkSession: a local master with two
# worker threads and explicit memory settings.
spark = (
    SparkSession.builder
    .appName("Columns")
    .master("local[2]")
    .config("spark.driver.memory", "1500m")
    # The key was previously misspelled as "spark.executer.memory"; Spark does
    # not recognise that name, so the intended executor setting was never applied.
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

In [6]:
# Read the gzipped hotel-reviews CSV: the first line is used as the header
# and column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True, compression="gzip")
    .csv("file:///home/train/datasets/Hotel_Reviews.csv.gz")
)

In [7]:
# Column expressions are built first and then applied, so each transformation
# can be read on its own.

# Split the Tags string on commas and cast the result to array<string>.
tags_as_array = F.split(F.col("Tags"), ",").cast(ArrayType(StringType()))

# Parse Review_Date into a date using the pattern M/d/yyyy.
review_date = F.to_date(F.col("Review_Date"), "M/d/yyyy")

df2 = (
    df
    .withColumn("Tags", tags_as_array)
    .withColumn("Review_Date", review_date)
)

In [8]:
# Bring only two rows to the driver and render them as a pandas DataFrame.
first_two_rows = df2.limit(2)
first_two_rows.toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.3605759,4.9159683
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.3605759,4.9159683


## Explain Plan

In [9]:
# Keep only the three columns needed, derive the review year, and sum
# Average_Score for every (Year, Reviewer_Nationality) pair.
df3 = (
    df2
    .select("Review_Date", "Average_Score", "Reviewer_Nationality")
    .withColumn("Year", F.year("Review_Date"))
    .groupBy("Year", "Reviewer_Nationality")
    .agg(F.sum("Average_Score").alias("Total_Avg_Score"))
)

In [10]:
# Print the extended plan for the aggregation (logical plans as well as the
# physical plan), not just the physical plan.
df3.explain(extended=True)

== Parsed Logical Plan ==
'Aggregate ['Year, 'Reviewer_Nationality], [unresolvedalias('Year, None), unresolvedalias('Reviewer_Nationality, None), sum('Average_Score) AS Total_Avg_Score#166]
+- Project [Review_Date#118, Average_Score#69, Reviewer_Nationality#71, year(Review_Date#118) AS Year#156]
   +- Project [Review_Date#118, Average_Score#69, Reviewer_Nationality#71]
      +- Project [Hotel_Address#66, Additional_Number_of_Scoring#67, to_date('Review_Date, Some(M/d/yyyy)) AS Review_Date#118, Average_Score#69, Hotel_Name#70, Reviewer_Nationality#71, Negative_Review#72, Review_Total_Negative_Word_Counts#73, Total_Number_of_Reviews#74, Positive_Review#75, Review_Total_Positive_Word_Counts#76, Total_Number_of_Reviews_Reviewer_Has_Given#77, Reviewer_Score#78, Tags#100, days_since_review#80, lat#81, lng#82]
         +- Project [Hotel_Address#66, Additional_Number_of_Scoring#67, Review_Date#68, Average_Score#69, Hotel_Name#70, Reviewer_Nationality#71, Negative_Review#72, Review_Total_Negati

In [11]:
# Stop the SparkSession created earlier in this notebook now that the work is done.
spark.stop()