In [1]:
# Start from a clean slate: delete everything inside the output directory.
! rm -rf ~/pyspark_output_data/*

In [2]:
# List the output directory to confirm it is empty after the cleanup.
! ls -l ~/pyspark_output_data/

total 0


In [3]:
import findspark

In [4]:
# Tell findspark where Spark is installed; this path is SPARK_HOME.
findspark.init(spark_home="/opt/manual/spark")

In [5]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.utils import AnalysisException

In [6]:
# Start (or reuse) a Spark session that runs locally on two threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Writing dataframe to disk")
    .getOrCreate()
)

2022-01-01 11:01:17,699 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [7]:
# Data source: https://www.kaggle.com/jiashenliu/515k-hotel-reviews-data-in-europe

In [8]:
#! wget -P ~/datasets \
#https://github.com/erkansirin78/datasets/raw/master/Hotel_Reviews.csv.gz

In [9]:
# List the entries in ~/datasets whose listing line contains "Hotel".
! ls -l ~/datasets | grep Hotel

-rw-rw-r--. 1 train train 46401315 Dec 30 23:34 Hotel_Reviews.csv.gz


In [10]:
# Define the schema by hand instead of letting Spark infer it.
# Only the type classes this notebook uses are imported, so it stays clear
# where each name comes from.
from pyspark.sql.types import (
    ArrayType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

programmatical_schema = StructType([
    StructField("Hotel_Address", StringType(), True),
    StructField("Additional_Number_of_Scoring", IntegerType(), True),
    StructField("Review_Date", StringType(), True),
    StructField("Average_Score", FloatType(), True),
    StructField("Hotel_Name", StringType(), True),
    StructField("Reviewer_Nationality", StringType(), True),
    StructField("Negative_Review", StringType(), True),
    StructField("Review_Total_Negative_Word_Counts", IntegerType(), True),
    StructField("Total_Number_of_Reviews", IntegerType(), True),
    StructField("Positive_Review", StringType(), True),
    StructField("Review_Total_Positive_Word_Counts", IntegerType(), True),
    StructField("Total_Number_of_Reviews_Reviewer_Has_Given", IntegerType(), True),
    StructField("Reviewer_Score", FloatType(), True),
    StructField("Tags", StringType(), True),
    StructField("days_since_review", StringType(), True),
    StructField("lat", FloatType(), True),
    StructField("lng", FloatType(), True),
])

# Tags is really a list of values, i.e.
#     StructField("Tags", ArrayType(StringType()), True)
# but CSV cannot store an array type, so it has to be read as StringType.

# Review_Date should be DateType(), but it is intentionally left as
# StringType() for the moment. It is converted as soon as the schema has been
# applied to the data.

In [11]:
# Read the gzip-compressed CSV (first row is the header) using the manual schema.
df = (
    spark.read
    .schema(programmatical_schema)
    .options(header=True, compression="gzip")
    .csv("file:///home/train/datasets/Hotel_Reviews.csv.gz")
)

In [12]:
# Correct the column types now that the data is loaded:
#   - Tags: split the text on "," first, then cast the result to array<string>
#   - Review_Date: parse the "M/d/yyyy" text into a date
df2 = (
    df
    .withColumn("Tags", F.split("Tags", ",").cast(ArrayType(StringType())))
    .withColumn("Review_Date", F.to_date("Review_Date", "M/d/yyyy"))
)

In [13]:
# Convert only the first five rows to pandas to get a readable preview.
df2.limit(5).toPandas()

                                                                                

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.360577,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.360577,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-31,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[[' Leisure trip ', ' Family with young child...",3 days,52.360577,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-31,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[[' Leisure trip ', ' Solo traveler ', ' Dup...",3 days,52.360577,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-24,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[[' Leisure trip ', ' Couple ', ' Suite ', ...",10 days,52.360577,4.915968


In [14]:
# Print the schema to check that Tags is now an array and Review_Date a date.
df2.printSchema()

root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: date (nullable = true)
 |-- Average_Score: float (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: float (nullable = true)
 |-- Tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: float (nullable = true)
 |-- lng: float (nullable = true)



# Write to disk

In [16]:
import time

# CSV

In [15]:
# List the output directory before anything is written.
! ls -l ~/pyspark_output_data/

total 0


In [17]:
# Writing df2 as CSV is expected to fail: Tags is array<string> and the CSV
# data source does not support that type. The error is caught and printed so
# the failure is still shown without aborting a "Run All" of the notebook.
# NOTE(review): this path differs from the ~/pyspark_output_data location used
# by the other writes in this notebook - confirm whether that is intentional.
try:
    (
        df2.write
        .format("csv")
        .mode("overwrite")
        .save("file:///home/train/venvspark/dev/output_data/hotel_reviews_csv")
    )
except AnalysisException as exc:
    print(f"AnalysisException: {exc}")

AnalysisException: CSV data source does not support array<string> data type.

In [18]:
# Workaround 1: drop the array column so the remaining columns can be written
# as CSV. The Tags data is lost in the output.
(
    df2.drop("Tags")
    .write
    .mode("overwrite")
    .format("csv")
    .save("file:///home/train/pyspark_output_data/hotel_reviews_csv")
)

                                                                                

In [19]:
# List the output directory (human-readable sizes) after the first write.
! ls -lh ~/pyspark_output_data/

total 0
drwxr-xr-x. 2 train train 176 Jan  1 11:06 hotel_reviews_csv


In [20]:
# List the files inside the hotel_reviews_csv output folder.
! ls -lh ~/pyspark_output_data/hotel_reviews_csv

total 173M
-rw-r--r--. 1 train train 173M Jan  1 11:06 part-00000-0af01ce7-3064-4af7-9a8a-21567096341d-c000.csv
-rw-r--r--. 1 train train    0 Jan  1 11:06 _SUCCESS


In [21]:
# Workaround 2: keep Tags by casting the array column to a string before
# writing, and report how long the write takes.

# perf_counter() is a monotonic clock intended for measuring elapsed time, so
# the reported duration is not affected by system clock adjustments.
start_time = time.perf_counter()

(
    df2.withColumn("Tags", F.col("Tags").cast("string"))
    .write
    .format("csv")
    .mode("overwrite")
    .save("file:///home/train/pyspark_output_data/hotel_reviews_csv2")
)

print("--- %s seconds ---" % (time.perf_counter() - start_time))

[Stage 2:>                                                          (0 + 1) / 1]

--- 26.761907815933228 seconds ---


                                                                                

In [22]:
 ls -l ~/pyspark_output_data/

total 0
drwxr-xr-x. 2 train train 176 Jan  1 11:06 [0m[38;5;27mhotel_reviews_csv[0m/
drwxr-xr-x. 2 train train 176 Jan  1 11:09 [38;5;27mhotel_reviews_csv2[0m/


In [23]:
# List the files inside the hotel_reviews_csv2 output folder.
! ls -lh ~/pyspark_output_data/hotel_reviews_csv2

total 228M
-rw-r--r--. 1 train train 228M Jan  1 11:09 part-00000-baa8c657-e8b0-498a-af45-c89f124813be-c000.csv
-rw-r--r--. 1 train train    0 Jan  1 11:09 _SUCCESS


In [24]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()