### Objective:

- Save and retrieve processed data efficiently inside Dataproc.
- Serve data in a structured way for analysis.
- Use Parquet files, Hive tables, and CSV files as the storage formats.

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse) the SparkSession used by the rest of the notebook.
#
# NOTE: getOrCreate() returns the already-running session when one exists
# (as on a managed notebook kernel). In that case only the runtime SQL
# settings (spark.sql.*) below take effect; the static executor/driver/memory
# settings are applied only when a brand-new session is actually created.
spark = (
    SparkSession.builder
    .appName('Olist Ecommerce Performance Optmization')
    # --- executor / driver resources (static: need a fresh session) ---
    .config('spark.executor.memory', '6g')
    .config('spark.executor.cores', '4')
    .config('spark.executor.instances', '2')
    .config('spark.driver.memory', '4g')
    .config('spark.driver.maxResultSize', '2g')
    # --- parallelism ---
    .config('spark.sql.shuffle.partitions', '64')
    .config('spark.default.parallelism', '64')
    # --- adaptive query execution ---
    .config('spark.sql.adaptive.enabled', 'true')
    # The key is "coalescePartitions" (plural); the singular spelling is not a
    # recognised setting and would be silently ignored.
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    # --- joins and file scanning ---
    .config('spark.sql.autoBroadcastJoinThreshold', 20 * 1024 * 1024)  # 20 MiB, in bytes
    .config('spark.sql.files.maxPartitionBytes', '64MB')
    .config('spark.sql.files.openCostInBytes', '2MB')
    # --- unified memory manager (static) ---
    .config('spark.memory.fraction', 0.8)
    .config('spark.memory.storageFraction', 0.2)
    .getOrCreate()
)

25/09/20 15:45:23 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Load the processed dataset (Parquet files) into a DataFrame.
full_orders_df = spark.read.format('parquet').load('/data/olist/processed/')

                                                                                

In [5]:
# Print the column names, types and nullability of the loaded DataFrame
# as a sanity check before writing it out.
full_orders_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (null

In [6]:
# Write full_orders_df as Parquet to '/data/olist/proc',
# replacing any data already present at that path.
full_orders_df.write.parquet('/data/olist/proc', mode='overwrite')

25/09/20 15:45:30 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [None]:
# Save it as Parquet in Google Cloud Storage (gs:// path),
# replacing any data already present at that location.
full_orders_df.write.parquet(
    'gs://dataproc-staging-us-central1-870618479357-obiyadgs/temp_data',
    mode='overwrite',
)



In [None]:
# Register the DataFrame as the table 'full_order_detail',
# overwriting the table if it already exists.
full_orders_df.write.saveAsTable('full_order_detail', mode='overwrite')

In [None]:
# spark.sql() returns a DataFrame; as a bare cell expression it would only
# display the column names/types. Call .show() to print the actual table list.
spark.sql('show tables').show()

In [None]:
# Export a CSV copy with a header row.
# The CSV goes to its own directory: '/data/olist/proc' already holds the
# Parquet dataset written earlier in this notebook, and mode('overwrite')
# on that same path would delete it and replace it with CSV files.
full_orders_df.write.mode('overwrite').option('header', 'true').csv('/data/olist/proc_csv/')