In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

In [None]:
# Build (or reuse, via getOrCreate) the Spark session used by every cell below,
# with explicit resource sizing and SQL tuning options.
spark = (
    SparkSession.builder
    .appName('Olist Ecommerce Performance Optimization')
    # Executor and driver sizing.
    .config('spark.executor.memory', '6g')
    .config('spark.executor.cores', '4')
    .config('spark.executor.instances', '2')
    .config('spark.driver.memory', '4g')
    .config('spark.driver.maxResultSize', '2g')
    # Partition counts for SQL shuffles and RDD operations.
    .config('spark.sql.shuffle.partitions', '64')
    .config('spark.default.parallelism', '64')
    # Adaptive query execution. The coalescing key is spelled
    # `coalescePartitions` (plural); a misspelled key is accepted without
    # error but never read, so the option would have no effect.
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    # Broadcast-join size threshold in bytes (20 * 1024 * 1024).
    .config('spark.sql.autoBroadcastJoinThreshold', 20 * 1024 * 1024)
    # File-scan partition sizing.
    .config('spark.sql.files.maxPartitionBytes', '64MB')
    .config('spark.sql.files.openCostInBytes', '2MB')
    # Unified memory manager fractions.
    .config('spark.memory.fraction', 0.8)
    .config('spark.memory.storageFraction', 0.2)
    .getOrCreate()
)



In [None]:
# Base directory that the raw CSV reads are resolved against.
hdfs_path = "/data/olist/"

In [None]:
def _read_olist_csv(file_name):
    """Read one CSV located under ``hdfs_path``, using its header row and inferring column types."""
    return spark.read.csv(hdfs_path + file_name, header=True, inferSchema=True)


# One DataFrame per raw CSV file; every file is read with the same options.
customers_df = _read_olist_csv('olist_customers_dataset.csv')
geolocation_df = _read_olist_csv('olist_geolocation_dataset.csv')
order_items_df = _read_olist_csv('olist_order_items_dataset.csv')
payments_df = _read_olist_csv('olist_order_payments_dataset.csv')
reviews_df = _read_olist_csv('olist_order_reviews_dataset.csv')
orders_df = _read_olist_csv('olist_orders_dataset.csv')
products_df = _read_olist_csv('olist_products_dataset.csv')
sellers_df = _read_olist_csv('olist_sellers_dataset.csv')
category_translation_df = _read_olist_csv('product_category_name_translation.csv')

In [None]:
# Load the Parquet dataset stored under /olist/processed/.
processed_orders_path = '/olist/processed/'
full_orders_df = spark.read.parquet(processed_orders_path)

In [None]:
# Print the column names and types of full_orders_df as a quick sanity check.
full_orders_df.printSchema()

## Optimized Join Strategies

In [None]:
# Broadcasting
#
# Wrap customers_df with a broadcast hint so that a join against it is planned
# as a broadcast join instead of a shuffle of both sides.
# `broadcast` is pyspark.sql.functions.broadcast and must be imported before
# this cell runs; without the import the call raises NameError.
customers_broadcast_df = broadcast(customers_df)

In [None]:
# Sort-Merge Join
#
# Both inputs are sorted by the join key inside each partition.
sorted_customers_df = customers_df.sortWithinPartitions('customer_id')
sorted_orders_df = full_orders_df.sortWithinPartitions('customer_id')

# A local sort alone does not choose the join strategy: the planner still picks
# it from size estimates, and a side below spark.sql.autoBroadcastJoinThreshold
# would be broadcast instead. The `merge` join hint (Spark 3.0+) explicitly
# requests a sort-merge join on customer_id.
optimized_merged_full_orders_df = sorted_orders_df.join(
    sorted_customers_df.hint('merge'),
    'customer_id',
)

In [None]:
# Bucket Join
#
# Hash-partition both sides on customer_id into the same number of partitions
# (10), so rows with equal keys land in the same partition index on each side
# before the join. repartition() takes the partition count and the column as
# two separate arguments.
bucketed_customers_df = customers_df.repartition(10, 'customer_id')
bucketed_orders_df = full_orders_df.repartition(10, 'customer_id')

bucket_join_df = bucketed_orders_df.join(bucketed_customers_df, 'customer_id')

## Data Serving Layer

In [None]:
!hadoop fs -mkdir /data/olist/proc

In [None]:
# Persist full_orders_df as Parquet under /olist/proc/, replacing whatever
# output already exists at that path.
(
    full_orders_df.write
    .mode('overwrite')
    .parquet('/olist/proc/')
)

In [None]:
# Persist full_orders_df as Parquet to a Google Cloud Storage location,
# replacing whatever output already exists there.
(
    full_orders_df.write
    .mode('overwrite')
    .parquet('gs://dataproc-staging-us-central1-690381932974-stdyoclo/temp_data')
)

In [None]:
# Persist full_orders_df as the table `full_order_detail`, replacing the table
# if it already exists.
(
    full_orders_df.write
    .mode('overwrite')
    .saveAsTable('full_order_detail')
)

In [None]:
# Save as CSV
#
# The CSV copy goes to its own directory. mode('overwrite') replaces everything
# under the target path, so writing CSV to /olist/proc/ would delete the
# Parquet output this notebook saves there. A sibling directory (not a
# sub-directory of /olist/proc/) keeps the two outputs fully separate.
(
    full_orders_df.write
    .mode('overwrite')
    .option('header', 'true')
    .csv('/olist/proc_csv/')
)