In [6]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
.appName('DAG and Broadcast demo')\
.getOrCreate()

In [7]:
# Sample order records, one (order_id, city, order_amount) tuple per order.
orders_data = [
    ("O001", "Hyderabad", 1200),
    ("O002", "Delhi", 800),
    ("O003", "Mumbai", 1500),
    ("O004", "Bangalore", 400),
    ("O005", "Hyderabad", 300),
    ("O006", "Delhi", 2000),
    ("O007", "Mumbai", 700),
    ("O008", "Bangalore", 1800),
    ("O009", "Delhi", 350),
    ("O010", "Hyderabad", 900),
]
orders_cols = ["order_id", "city", "order_amount"]

# Only column names are supplied, so Spark infers the column types itself.
orders_df = spark.createDataFrame(orders_data, schema=orders_cols)
orders_df.show()

+--------+---------+------------+
|order_id|     city|order_amount|
+--------+---------+------------+
|    O001|Hyderabad|        1200|
|    O002|    Delhi|         800|
|    O003|   Mumbai|        1500|
|    O004|Bangalore|         400|
|    O005|Hyderabad|         300|
|    O006|    Delhi|        2000|
|    O007|   Mumbai|         700|
|    O008|Bangalore|        1800|
|    O009|    Delhi|         350|
|    O010|Hyderabad|         900|
+--------+---------+------------+



In [8]:
# Small lookup table mapping each city to its category.
city_data = [
    ("Hyderabad", "Tier 1"),
    ("Delhi", "Tier 1"),
    ("Mumbai", "Tier 1"),
    ("Bangalore", "Tier 1"),
]
city_cols = ["city", "category"]

# Only column names are supplied, so Spark infers the column types itself.
city_df = spark.createDataFrame(city_data, schema=city_cols)
city_df.show()

+---------+--------+
|     city|category|
+---------+--------+
|Hyderabad|  Tier 1|
|    Delhi|  Tier 1|
|   Mumbai|  Tier 1|
|Bangalore|  Tier 1|
+---------+--------+



In [10]:
from pyspark.sql.functions import col
# Keep only the orders whose amount is strictly greater than 500.
# This is a lazy transformation: nothing runs until an action is called.
filtered_orders = orders_df.where(col("order_amount") > 500)

In [11]:
# Inner equi-join on the shared "city" column, attaching each order's category.
joined_df = filtered_orders.join(city_df, on=["city"], how="inner")

In [13]:
# Project the joined result into the presentation column order.
final_df = joined_df.select(
    ["order_id", "city", "category", "order_amount"]
)

In [14]:
# Print the logical plans (parsed, analyzed, optimized) and the physical plan.
final_df.explain(extended=True)

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'category, 'order_amount]
+- Project [city#23, order_id#22, order_amount#24L, category#36]
   +- Join Inner, (city#23 = city#35)
      :- Filter (order_amount#24L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#22, city#23, order_amount#24L], false
      +- LogicalRDD [city#35, category#36], false

== Analyzed Logical Plan ==
order_id: string, city: string, category: string, order_amount: bigint
Project [order_id#22, city#23, category#36, order_amount#24L]
+- Project [city#23, order_id#22, order_amount#24L, category#36]
   +- Join Inner, (city#23 = city#35)
      :- Filter (order_amount#24L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#22, city#23, order_amount#24L], false
      +- LogicalRDD [city#35, category#36], false

== Optimized Logical Plan ==
Project [order_id#22, city#23, category#36, order_amount#24L]
+- Join Inner, (city#23 = city#35)
   :- Filter ((isnotnull(order_amount#24L) AND (order_amount#24L > 50

In [15]:
from pyspark.sql.functions import broadcast
# Inner join on "city" with an explicit broadcast hint on the small lookup
# table, so the planner is asked to ship city_df to the executors instead of
# shuffling both sides of the join.
broadcast_join_df = filtered_orders.join(broadcast(city_df), on=["city"], how="inner")

# Project the broadcast-join result into the presentation column order.
final_broadcast_df = broadcast_join_df.select(
    ["order_id", "city", "category", "order_amount"]
)

In [16]:
# Print the logical plans (parsed, analyzed, optimized) and the physical plan;
# the broadcast hint shows up as a ResolvedHint node in the logical plans.
final_broadcast_df.explain(extended=True)

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'category, 'order_amount]
+- Project [city#23, order_id#22, order_amount#24L, category#36]
   +- Join Inner, (city#23 = city#35)
      :- Filter (order_amount#24L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#22, city#23, order_amount#24L], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#35, category#36], false

== Analyzed Logical Plan ==
order_id: string, city: string, category: string, order_amount: bigint
Project [order_id#22, city#23, category#36, order_amount#24L]
+- Project [city#23, order_id#22, order_amount#24L, category#36]
   +- Join Inner, (city#23 = city#35)
      :- Filter (order_amount#24L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#22, city#23, order_amount#24L], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#35, category#36], false

== Optimized Logical Plan ==
Project [order_id#22, city#23, category#36, order_amount#24L]
+- Join Inne

Broadcast Hash Join

Purpose: Optimizes Spark joins between a large DataFrame and a DataFrame small enough to fit in memory on the driver and on every executor.

Mechanism: The small DataFrame is collected on the driver and then copied (broadcast) to every executor in the cluster.
Execution: Each executor then performs a local hash join between its partitions of the large DataFrame and its in-memory copy of the small DataFrame.

Benefit 1 (Shuffle): Avoids shuffling the large DataFrame across the network, which a shuffle-based join (for example, a sort-merge join) would otherwise require.

Benefit 2 (Speed): Leads to faster joins because local hash joins are quicker than shuffle-based joins.

Benefit 3 (Efficiency): Improves resource efficiency by reducing network traffic and disk I/O.

Indication: In the physical plan printed by `.explain(True)`, the join appears as a `BroadcastHashJoin` operator, with a `BroadcastExchange` node feeding the broadcast side.
