In [2]:
from pyspark.sql import SparkSession

https://spark.apache.org/docs/latest/sql-performance-tuning.html - official documentation


In [2]:
spark = SparkSession.builder\
.appName('Olist Ecommerce Performance Optimization')\
.config('spark.executor.memory','4g')\
.config('spark.executor.cores','2')\
.config('spark.executor.instances','2')\
.config('spark.driver.memory','4g')\
.config('spark.driver.maxResultSize','2g')\
.config('spark.shuffle.partitions','32')\
.config('spark.default.parallelism','32')\
.config('spark.sql.adaptive.enabled','true')\
.config('spark.sql.coalescePartition.enabled','true')\
.config('spark.sql.autoBroadcastJoinThreshold',10*1024*1024)\
.config('spark.sql.files.maxPartitionBytes','32MB')\
.config('spark.sql.files.openCostInBytes','2MB')\
.config('spark.memory.fraction',0.8).config('spark.memory.storageFraction',0.2).getOrCreate()

25/07/26 04:46:45 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
hdfs_path = '/data/olist/'

In [4]:
customers_df = spark.read.csv(hdfs_path+'olist_customers_dataset.csv',header=True, inferSchema=True)
products_df = spark.read.csv(hdfs_path+'olist_products_dataset.csv',header=True,inferSchema=True)
order_df = spark.read.csv(hdfs_path+'olist_orders_dataset.csv',header = True , inferSchema = True)
item_df = spark.read.csv(hdfs_path+'olist_order_items_dataset.csv',header = True, inferSchema=True)
payment_df = spark.read.csv(hdfs_path+'olist_order_payments_dataset.csv',header = True , inferSchema = True)
reviews_df = spark.read.csv(hdfs_path+'olist_order_reviews_dataset.csv',header = True , inferSchema = True)
location_df = spark.read.csv(hdfs_path+'olist_geolocation_dataset.csv',header = True , inferSchema = True)
seller_df = spark.read.csv(hdfs_path+'olist_sellers_dataset.csv',header = True , inferSchema = True)
trnaslation_df = spark.read.csv(hdfs_path+'product_category_name_translation.csv',header = True , inferSchema = True)

                                                                                

### Optimized Join strategies

In [5]:
!hadoop fs -ls -h /data/processed/

Found 11 items
-rw-r--r--   2 root hadoop          0 2025-07-26 05:18 /data/processed/_SUCCESS
-rw-r--r--   2 root hadoop     31.6 M 2025-07-26 05:18 /data/processed/part-00000-600f2acf-9ab3-4341-b468-6e89bfad091f-c000.snappy.parquet
-rw-r--r--   2 root hadoop     31.2 M 2025-07-26 05:18 /data/processed/part-00001-600f2acf-9ab3-4341-b468-6e89bfad091f-c000.snappy.parquet
-rw-r--r--   2 root hadoop     31.3 M 2025-07-26 05:18 /data/processed/part-00002-600f2acf-9ab3-4341-b468-6e89bfad091f-c000.snappy.parquet
-rw-r--r--   2 root hadoop     30.9 M 2025-07-26 05:18 /data/processed/part-00003-600f2acf-9ab3-4341-b468-6e89bfad091f-c000.snappy.parquet
-rw-r--r--   2 root hadoop     31.5 M 2025-07-26 05:18 /data/processed/part-00004-600f2acf-9ab3-4341-b468-6e89bfad091f-c000.snappy.parquet
-rw-r--r--   2 root hadoop     18.0 M 2025-07-26 05:18 /data/processed/part-00005-600f2acf-9ab3-4341-b468-6e89bfad091f-c000.snappy.parquet
-rw-r--r--   2 root hadoop     18.3 M 2025-07-26 05:18 /data/processed/

In [6]:
full_order_df = spark.read.parquet('/data/processed/')

In [7]:
full_order_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (null

In [9]:
# Broadcast
from pyspark.sql.functions import *

customer_broadcast_df = broadcast(customers_df)
optimized_broadcast_join = full_order_df.join(customer_broadcast_df,'customer_id')


In [10]:
optimized_broadcast_join.show(5)

25/07/26 05:27:48 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 20:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+-------------------+-----+-------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+----------------------+-----------+------------+--------------------+------------------------+-------------+--------------+---------------------------+-------------------+-------------------+----------------+-----------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+------------------+------------+--------------------+-------------+------------+------------+------------------+----------------+-----------+--------------+--------------------+------------------------+-----

                                                                                

## Sort and Merge Join

In [11]:
sorted_customers_df = customers_df.sortWithinPartitions('customer_id')
sorted_orders_df = full_order_df.sortWithinPartitions('customer_id')

In [12]:
optimized_merge_full_order_df = sorted_orders_df.join(sorted_customers_df,'customer_id')

In [13]:
optimized_merge_full_order_df.show(5)

+--------------------+--------------------+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+-------------------+-----+-------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+----------------------+-----------+------------+--------------------+------------------------+-------------+--------------+---------------------------+-------------------+-------------------+----------------+-----------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+------------------+------------+--------------------+-------------+------------+------------+------------------+----------------+-----------+--------------+--------------------+------------------------+-----

# Bucket join - when we have repeated queries and hence data needs to be bucketed.

In [14]:
bucketed_customers_df = customers_df.repartition(10,'customer_id')
bucketed_orders_df = full_order_df.repartition(10,'customer_id')

bucketed_full_order_df = bucketed_orders_df.join(bucketed_customers_df,'customer_id')

In [15]:
bucketed_full_order_df.show(5)



+--------------------+--------------------+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+-------------------+-----+-------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+----------------------+-----------+------------+--------------------+------------------------+-------------+--------------+---------------------------+-------------------+-------------------+----------------+-----------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+------------------+------------+--------------------+-------------+------------+------------+------------------+----------------+-----------+--------------+--------------------+------------------------+-----

                                                                                

### Skew handled join

In [17]:
skew_handked_join = full_order_df.join(customers_df.hint('skew'),'customer_id')

25/07/26 05:52:45 WARN HintErrorLogger: Unrecognized hint: skew()


In [18]:
spark.stop()