In [1]:
import findspark

In [2]:
# Point findspark at the local Spark installation before importing pyspark.
SPARK_HOME = "/opt/manual/spark"
findspark.init(spark_home=SPARK_HOME)

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [4]:
# Build (or reuse) a Hive-enabled Spark session named "DataFrameWriter"
# with YARN as the master.
session_builder = SparkSession.builder.appName("DataFrameWriter").master("yarn")
spark = session_builder.enableHiveSupport().getOrCreate()

In [6]:
# Load the gzipped hotel-reviews CSV from HDFS.
# - header=True: the first line supplies the column names.
# - inferSchema=True: column types are inferred from the data.
# No "compression" option is passed: in Spark's CSV data source that option
# is documented for writes; on read the codec is chosen from the ".gz"
# file extension.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/user/train/datasets/Hotel_Reviews.csv.gz")
)

In [7]:
# Derive the typed columns used for writing:
#   Tags        -> comma-split and cast to array<string>
#   Review_Date -> parsed as a date using the "M/d/yyyy" pattern
tags_as_array = F.split("Tags", ",").cast(ArrayType(StringType()))
review_date_parsed = F.to_date("Review_Date", "M/d/yyyy")

df2 = (
    df
    .withColumn("Tags", tags_as_array)
    .withColumn("Review_Date", review_date_parsed)
)

In [8]:
# Print the schema of the transformed frame to check the resulting column types
# (notably Tags and Review_Date, which were rewritten above).
df2.printSchema()

root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: date (nullable = true)
 |-- Average_Score: double (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: double (nullable = true)
 |-- Tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)



# Parquet

In [9]:
import time

# Time an overwrite of df2 as a Parquet dataset on HDFS without naming a codec.
# NOTE(review): with no "compression" option Spark falls back to
# spark.sql.parquet.compression.codec (documented default: snappy) — confirm
# against the cluster config before comparing this timing with other codecs.
start_time = time.perf_counter()  # monotonic clock, intended for measuring durations

(
    df2.write
    .format("parquet")
    .mode("overwrite")  # replace any existing output at the target path
    .save("hdfs://localhost:9000/user/train/output_data/hotel_review_parquet")
)

elapsed_secs = time.perf_counter() - start_time
# Same message layout as the Snappy cell so the two timings read alike
# (the previous "%s secs = " left a dangling " = " after the value).
print("---------%s secs ---------" % elapsed_secs)

30.173896074295044 secs = 


In [10]:
# List the HDFS output directory to confirm the Parquet dataset directory exists.
! hdfs dfs -ls /user/train/output_data

Found 1 items
drwxr-xr-x   - train supergroup          0 2025-06-12 23:08 /user/train/output_data/hotel_review_parquet


# Parquet with explicit Snappy compression

In [11]:
import time

# Time an overwrite of df2 as a Parquet dataset on HDFS with the Snappy codec
# requested explicitly through the writer's "compression" option.
start_time = time.perf_counter()  # monotonic clock, intended for measuring durations

(
    df2.write
    .format("parquet")
    .mode("overwrite")  # replace any existing output at the target path
    .option("compression", "snappy")
    .save("hdfs://localhost:9000/user/train/output_data/hotel_review_parquet_snappy")
)

elapsed_secs = time.perf_counter() - start_time
print("---------%s secs ---------" % elapsed_secs)

---------28.255491971969604 secs ---------


In [12]:
# List the files written for the Snappy dataset (the _SUCCESS marker and part files).
! hdfs dfs -ls /user/train/output_data/hotel_review_parquet_snappy

Found 2 items
-rw-r--r--   1 train supergroup          0 2025-06-12 23:14 /user/train/output_data/hotel_review_parquet_snappy/_SUCCESS
-rw-r--r--   1 train supergroup   60302157 2025-06-12 23:14 /user/train/output_data/hotel_review_parquet_snappy/part-00000-1929b3ca-37f5-44ce-9c8d-fbd680dc062c-c000.snappy.parquet
