In [1]:
import findspark

In [2]:
# Initialise findspark with the Spark installation located at /opt/manual/spark.
# NOTE(review): presumably this is what makes the pyspark imports in the next
# cell resolvable — confirm against the environment setup.
findspark.init("/opt/manual/spark")

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import  *

In [4]:
# Build (or reuse) the SparkSession used by every cell below: the application
# is submitted to YARN with Hive support enabled.
# The call chain is wrapped in parentheses instead of relying on trailing
# backslashes; a missing backslash after .master("yarn") left the following
# line starting with "." and made this cell a SyntaxError.
spark = (
    SparkSession.builder
    .appName("DataframeWriter Json and Csv")
    .master("yarn")
    .enableHiveSupport()
    .getOrCreate()
)

In [5]:
# Load the gzip-compressed hotel reviews CSV. The first row is treated as the
# header and Spark infers the column types.
reviews_path = "/user/train/datasets/Hotel_Reviews.csv.gz"

df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True, compression="gzip")
    .load(reviews_path)
)

In [6]:
# Derive the typed frame used by all writer cells:
#   * Tags        -> split on "," and cast to array<string>
#   * Review_Date -> parsed as a date using the "M/d/yyyy" pattern
tags_as_array = F.split(F.col("Tags"), ",").cast(ArrayType(StringType()))
review_date = F.to_date(F.col("Review_Date"), "M/d/yyyy")

df2 = (
    df
    .withColumn("Tags", tags_as_array)
    .withColumn("Review_Date", review_date)
)

In [7]:
# Preview two rows of the transformed frame, converted to pandas for display.
df2.limit(2).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.3605759,4.9159683
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.3605759,4.9159683


In [8]:
# Print df2's column names and types to check the Tags and Review_Date conversions.
df2.printSchema()

root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: date (nullable = true)
 |-- Average_Score: double (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: double (nullable = true)
 |-- Tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)



In [9]:
import time

# json

In [10]:

# Write df2 as uncompressed JSON (replacing any previous output) and report
# how long the write took in wall-clock seconds.
json_target = "hdfs://localhost:9000/user/train/output_data/hotel_review_json"

start_time = time.time()

(
    df2.write
    .mode("overwrite")
    .format("json")
    .save(json_target)
)

elapsed = time.time() - start_time
print("----------- %s secs ----------" % elapsed)

----------- 28.398805618286133 secs ----------


In [11]:
# List the JSON output directory on HDFS with human-readable sizes (-h).
! hdfs dfs -ls -h /user/train/output_data/hotel_review_json

Found 2 items
-rw-r--r--   1 train supergroup          0 2020-12-13 11:20 /user/train/output_data/hotel_review_json/_SUCCESS
-rw-r--r--   1 train supergroup    413.0 M 2020-12-13 11:20 /user/train/output_data/hotel_review_json/part-00000-06357188-183e-4e4b-81ed-6f0792dc5700-c000.json


# json with compression

In [13]:
# Write df2 as gzip-compressed JSON (replacing any previous output) and report
# how long the write took in wall-clock seconds.
json_gz_target = "hdfs://localhost:9000/user/train/output_data/hotel_review_json_compress"

start_time = time.time()

(
    df2.write
    .mode("overwrite")
    .format("json")
    .option("compression", "gzip")
    .save(json_gz_target)
)

elapsed = time.time() - start_time
print("----------- %s secs ----------" % elapsed)

----------- 44.820574045181274 secs ----------


In [14]:
! hdfs dfs -ls -h /user/train/output_data/hotel_review_avro_compress

Found 2 items
-rw-r--r--   1 train supergroup          0 2020-12-13 08:15 /user/train/output_data/hotel_review_avro_compress/_SUCCESS
-rw-r--r--   1 train supergroup     41.9 M 2020-12-13 08:15 /user/train/output_data/hotel_review_avro_compress/part-00000-822aede2-ae39-4be5-8dec-f958b156c714-c000.avro


# csv

In [15]:
from pyspark.sql.types import StringType
# Write df2 as uncompressed CSV (replacing any previous output) and report how
# long the write took in wall-clock seconds. Tags is cast to a string first,
# since it is an array column in df2.
csv_target = "hdfs://localhost:9000/user/train/output_data/hotel_review_csv"

start_time = time.time()

tags_as_text = F.col("Tags").cast(StringType())
(
    df2.withColumn("Tags", tags_as_text)
    .write
    .mode("overwrite")
    .format("csv")
    .save(csv_target)
)

elapsed = time.time() - start_time
print("----------- %s secs ----------" % elapsed)

----------- 27.801115036010742 secs ----------


In [16]:
# List the CSV output directory on HDFS with human-readable sizes (-h).
! hdfs dfs -ls -h /user/train/output_data/hotel_review_csv

Found 2 items
-rw-r--r--   1 train supergroup          0 2020-12-13 11:25 /user/train/output_data/hotel_review_csv/_SUCCESS
-rw-r--r--   1 train supergroup    228.0 M 2020-12-13 11:25 /user/train/output_data/hotel_review_csv/part-00000-23b6f207-b253-4cc6-a2b8-268cbbb9e427-c000.csv


# csv with compression

In [18]:
# Write df2 as gzip-compressed CSV (replacing any previous output) and report
# how long the write took in wall-clock seconds. Tags is cast to a string
# first, since it is an array column in df2.
csv_gz_target = "hdfs://localhost:9000/user/train/output_data/hotel_review_csv_compress"

start_time = time.time()

tags_as_text = F.col("Tags").cast(StringType())
(
    df2.withColumn("Tags", tags_as_text)
    .write
    .mode("overwrite")
    .format("csv")
    .option("compression", "gzip")
    .save(csv_gz_target)
)

elapsed = time.time() - start_time
print("----------- %s secs ----------" % elapsed)

----------- 44.98415207862854 secs ----------


In [19]:
# List the gzip-compressed CSV output directory on HDFS with human-readable sizes (-h).
! hdfs dfs -ls -h /user/train/output_data/hotel_review_csv_compress

Found 2 items
-rw-r--r--   1 train supergroup          0 2020-12-13 11:27 /user/train/output_data/hotel_review_csv_compress/_SUCCESS
-rw-r--r--   1 train supergroup     44.2 M 2020-12-13 11:27 /user/train/output_data/hotel_review_csv_compress/part-00000-1fac1ed2-b3ba-43af-bb49-34c9fc70b88c-c000.csv.gz


In [20]:
# Stop the SparkSession now that every write and listing cell has run.
spark.stop()