# BUCKETING FOR PERFOMANCE OPTIMIZATION

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/28 12:56:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Check Bucketing Enable

In [4]:
spark.conf.get("spark.sql.sources.bucketing.enabled")

'true'

## Sample Data

In [5]:
from pyspark.sql.functions import col, rand

In [6]:
df = spark.range(1, 100000, 1, 10).select(col("id").alias("pk"), rand(10).alias("attribute"))
df.show()

+---+-------------------+
| pk|          attribute|
+---+-------------------+
|  1| 0.1709497137955568|
|  2| 0.8051143958005459|
|  3| 0.5775925576589018|
|  4| 0.9476047869880925|
|  5|    0.2093704977577|
|  6|0.36664222617947817|
|  7| 0.8078688178371882|
|  8| 0.7135143433452461|
|  9| 0.7195325566306053|
| 10|0.31335292311175456|
| 11| 0.8062503712025726|
| 12|0.10814914646176654|
| 13| 0.3362232980701172|
| 14| 0.8133304803837667|
| 15|0.47649428738170896|
| 16|  0.524728096293865|
| 17| 0.9701253460019921|
| 18| 0.6232167713919952|
| 19| 0.5089687568245219|
| 20| 0.5467504094508642|
+---+-------------------+
only showing top 20 rows



                                                                                

In [7]:
df.count()

99999

In [8]:
df.rdd.getNumPartitions()

10

## Create Non-Bucketed Table

In [9]:
df.write.format("parquet").saveAsTable("non_bucketed_table")

                                                                                

In [13]:
%ls -la spark-warehouse/non_bucketed_table

total 1244
drwxr-xr-x. 1 andresmunozpampillon andresmunozpampillon   2822 ago 28 13:05 [0m[01;34m.[0m/
drwxr-xr-x. 1 andresmunozpampillon andresmunozpampillon     64 ago 28 13:07 [01;34m..[0m/
-rw-r--r--. 1 andresmunozpampillon andresmunozpampillon 120799 ago 28 13:05 part-00000-f80bdd04-3e33-42c9-a25d-428cfb027c16-c000.snappy.parquet
-rw-r--r--. 1 andresmunozpampillon andresmunozpampillon    952 ago 28 13:05 .part-00000-f80bdd04-3e33-42c9-a25d-428cfb027c16-c000.snappy.parquet.crc
-rw-r--r--. 1 andresmunozpampillon andresmunozpampillon 120801 ago 28 13:05 part-00001-f80bdd04-3e33-42c9-a25d-428cfb027c16-c000.snappy.parquet
-rw-r--r--. 1 andresmunozpampillon andresmunozpampillon    952 ago 28 13:05 .part-00001-f80bdd04-3e33-42c9-a25d-428cfb027c16-c000.snappy.parquet.crc
-rw-r--r--. 1 andresmunozpampillon andresmunozpampillon 120799 ago 28 13:05 part-00002-f80bdd04-3e33-42c9-a25d-428cfb027c16-c000.snappy.parquet
-rw-r--r--. 1 andresmunozpampillon andresmunozpampillon    952 ago 28 13

## Create Bucketed Table

In [10]:
df.write.format("parquet").bucketBy(10,"pk").saveAsTable("bucketed_table")

                                                                                

In [14]:
%ls -la spark-warehouse/bucketed_table

total 1976
drwxr-xr-x. 1 andresmunozpampillon andresmunozpampillon 30242 ago 28 13:08 [0m[01;34m.[0m/
drwxr-xr-x. 1 andresmunozpampillon andresmunozpampillon    64 ago 28 13:07 [01;34m..[0m/
-rw-r--r--. 1 andresmunozpampillon andresmunozpampillon 12358 ago 28 13:07 part-00000-95abc15e-2929-45dc-af5c-321ed1d1a191_00000.c000.snappy.parquet
-rw-r--r--. 1 andresmunozpampillon andresmunozpampillon   108 ago 28 13:07 .part-00000-95abc15e-2929-45dc-af5c-321ed1d1a191_00000.c000.snappy.parquet.crc
-rw-r--r--. 1 andresmunozpampillon andresmunozpampillon 13375 ago 28 13:07 part-00000-95abc15e-2929-45dc-af5c-321ed1d1a191_00001.c000.snappy.parquet
-rw-r--r--. 1 andresmunozpampillon andresmunozpampillon   116 ago 28 13:07 .part-00000-95abc15e-2929-45dc-af5c-321ed1d1a191_00001.c000.snappy.parquet.crc
-rw-r--r--. 1 andresmunozpampillon andresmunozpampillon 11976 ago 28 13:07 part-00000-95abc15e-2929-45dc-af5c-321ed1d1a191_00002.c000.snappy.parquet
-rw-r--r--. 1 andresmunozpampillon andresmunozpam

## Bucketed and Non-Bucketed Tables

In [15]:
d1 = spark.table("non_bucketed_table")
d2 = spark.table("non_bucketed_table")

d3 = spark.table("bucketed_table")
d4 = spark.table("bucketed_table")

## Broadcast Join by default for files less than 10MB

In [16]:
d3.join(d4,"pk","inner").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [pk#32L, attribute#33, attribute#39]
   +- BroadcastHashJoin [pk#32L], [pk#38L], Inner, BuildRight, false
      :- Filter isnotnull(pk#32L)
      :  +- FileScan parquet spark_catalog.default.bucketed_table[pk#32L,attribute#33] Batched: true, Bucketed: false (disabled by query planner), DataFilters: [isnotnull(pk#32L)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/andresmunozpampillon/spark-files/spark_da/spark-warehouse/b..., PartitionFilters: [], PushedFilters: [IsNotNull(pk)], ReadSchema: struct<pk:bigint,attribute:double>
      +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=137]
         +- Filter isnotnull(pk#38L)
            +- FileScan parquet spark_catalog.default.bucketed_table[pk#38L,attribute#39] Batched: true, Bucketed: false (disabled by query planner), DataFilters: [isnotnull(pk#38L)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[

## Disable Broadcast Join

In [17]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
spark.conf.set("spark.sql.adaptative.enable", False)

## Bucketed and Bucketed Join. No sides shuffle

In [18]:
d3.join(d4,"pk","inner").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [pk#32L, attribute#33, attribute#46]
   +- SortMergeJoin [pk#32L], [pk#45L], Inner
      :- Sort [pk#32L ASC NULLS FIRST], false, 0
      :  +- Filter isnotnull(pk#32L)
      :     +- FileScan parquet spark_catalog.default.bucketed_table[pk#32L,attribute#33] Batched: true, Bucketed: true, DataFilters: [isnotnull(pk#32L)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/andresmunozpampillon/spark-files/spark_da/spark-warehouse/b..., PartitionFilters: [], PushedFilters: [IsNotNull(pk)], ReadSchema: struct<pk:bigint,attribute:double>, SelectedBucketsCount: 10 out of 10
      +- Sort [pk#45L ASC NULLS FIRST], false, 0
         +- Filter isnotnull(pk#45L)
            +- FileScan parquet spark_catalog.default.bucketed_table[pk#45L,attribute#46] Batched: true, Bucketed: true, DataFilters: [isnotnull(pk#45L)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/andresmunozpampillon/spark-files/spar

In [19]:
d3.join(d4,"pk","inner").show()

[Stage 6:>                                                          (0 + 1) / 1]

+---+-------------------+-------------------+
| pk|          attribute|          attribute|
+---+-------------------+-------------------+
|  4| 0.9476047869880925| 0.9476047869880925|
| 16|  0.524728096293865|  0.524728096293865|
| 18| 0.6232167713919952| 0.6232167713919952|
| 26|0.09865667253909105|0.09865667253909105|
| 27|0.06622344831941485|0.06622344831941485|
| 29|0.19412650105821194|0.19412650105821194|
| 61| 0.5352770539384141| 0.5352770539384141|
| 63| 0.5771966653256273| 0.5771966653256273|
| 64| 0.8092021738191977| 0.8092021738191977|
| 83|0.47415236401346694|0.47415236401346694|
| 92| 0.7195975867438303| 0.7195975867438303|
|100| 0.7546308964312055| 0.7546308964312055|
|115| 0.9087820885752338| 0.9087820885752338|
|125| 0.7086532984353837| 0.7086532984353837|
|130| 0.6026282217154758| 0.6026282217154758|
|140|0.22393800050758073|0.22393800050758073|
|153|0.20399627293507627|0.20399627293507627|
|158| 0.1493096806468921| 0.1493096806468921|
|163| 0.9832199149707848| 0.983219

                                                                                

## Non-Bucketed and Non-Bucketed Join. Both sides shuffle

In [21]:
d1.join(d2,"pk","inner").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [pk#26L, attribute#27, attribute#71]
   +- SortMergeJoin [pk#26L], [pk#70L], Inner
      :- Sort [pk#26L ASC NULLS FIRST], false, 0
      :  +- Exchange hashpartitioning(pk#26L, 200), ENSURE_REQUIREMENTS, [plan_id=259]
      :     +- Filter isnotnull(pk#26L)
      :        +- FileScan parquet spark_catalog.default.non_bucketed_table[pk#26L,attribute#27] Batched: true, DataFilters: [isnotnull(pk#26L)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/andresmunozpampillon/spark-files/spark_da/spark-warehouse/n..., PartitionFilters: [], PushedFilters: [IsNotNull(pk)], ReadSchema: struct<pk:bigint,attribute:double>
      +- Sort [pk#70L ASC NULLS FIRST], false, 0
         +- Exchange hashpartitioning(pk#70L, 200), ENSURE_REQUIREMENTS, [plan_id=260]
            +- Filter isnotnull(pk#70L)
               +- FileScan parquet spark_catalog.default.non_bucketed_table[pk#70L,attribute#71] Batched: true, DataFilter

In [22]:
d1.join(d2,"pk","inner").show()

+---+-------------------+-------------------+
| pk|          attribute|          attribute|
+---+-------------------+-------------------+
|  1| 0.1709497137955568| 0.1709497137955568|
|  5|    0.2093704977577|    0.2093704977577|
|  6|0.36664222617947817|0.36664222617947817|
|  7| 0.8078688178371882| 0.8078688178371882|
|  9| 0.7195325566306053| 0.7195325566306053|
| 17| 0.9701253460019921| 0.9701253460019921|
| 19| 0.5089687568245219| 0.5089687568245219|
| 22| 0.6286498635045461| 0.6286498635045461|
| 25| 0.6822647942285944| 0.6822647942285944|
| 26|0.09865667253909105|0.09865667253909105|
| 27|0.06622344831941485|0.06622344831941485|
| 28| 0.5076232741953021| 0.5076232741953021|
| 29|0.19412650105821194|0.19412650105821194|
| 31| 0.4309638186000935| 0.4309638186000935|
| 32| 0.6606700902936103| 0.6606700902936103|
| 33| 0.9980781298103227| 0.9980781298103227|
| 34| 0.6993604281994343| 0.6993604281994343|
| 39| 0.9135015594663367| 0.9135015594663367|
| 41| 0.5601961495594829| 0.560196

## Non-Bucketed and Bucketed Join. One side shuffle

In [23]:
d1.join(d3,"pk","inner").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [pk#26L, attribute#27, attribute#33]
   +- SortMergeJoin [pk#26L], [pk#32L], Inner
      :- Sort [pk#26L ASC NULLS FIRST], false, 0
      :  +- Exchange hashpartitioning(pk#26L, 10), ENSURE_REQUIREMENTS, [plan_id=416]
      :     +- Filter isnotnull(pk#26L)
      :        +- FileScan parquet spark_catalog.default.non_bucketed_table[pk#26L,attribute#27] Batched: true, DataFilters: [isnotnull(pk#26L)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/andresmunozpampillon/spark-files/spark_da/spark-warehouse/n..., PartitionFilters: [], PushedFilters: [IsNotNull(pk)], ReadSchema: struct<pk:bigint,attribute:double>
      +- Sort [pk#32L ASC NULLS FIRST], false, 0
         +- Filter isnotnull(pk#32L)
            +- FileScan parquet spark_catalog.default.bucketed_table[pk#32L,attribute#33] Batched: true, Bucketed: true, DataFilters: [isnotnull(pk#32L)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file

In [24]:
d1.join(d3,"pk","inner").show()

                                                                                

+---+-------------------+-------------------+
| pk|          attribute|          attribute|
+---+-------------------+-------------------+
|  4| 0.9476047869880925| 0.9476047869880925|
| 16|  0.524728096293865|  0.524728096293865|
| 18| 0.6232167713919952| 0.6232167713919952|
| 26|0.09865667253909105|0.09865667253909105|
| 27|0.06622344831941485|0.06622344831941485|
| 29|0.19412650105821194|0.19412650105821194|
| 61| 0.5352770539384141| 0.5352770539384141|
| 63| 0.5771966653256273| 0.5771966653256273|
| 64| 0.8092021738191977| 0.8092021738191977|
| 83|0.47415236401346694|0.47415236401346694|
| 92| 0.7195975867438303| 0.7195975867438303|
|100| 0.7546308964312055| 0.7546308964312055|
|115| 0.9087820885752338| 0.9087820885752338|
|125| 0.7086532984353837| 0.7086532984353837|
|130| 0.6026282217154758| 0.6026282217154758|
|140|0.22393800050758073|0.22393800050758073|
|153|0.20399627293507627|0.20399627293507627|
|158| 0.1493096806468921| 0.1493096806468921|
|163| 0.9832199149707848| 0.983219