In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("DAG and Broadcast Exercise").getOrCreate()

#Dataset 1

In [4]:
rides_data = [
("R001","U001","Hyderabad",12.5,240,"Completed"),
("R002","U002","Delhi",8.2,180,"Completed"),
("R003","U003","Mumbai",15.0,300,"Cancelled"),
("R004","U004","Bangalore",5.5,120,"Completed"),
("R005","U005","Hyderabad",20.0,360,"Completed"),
("R006","U006","Delhi",25.0,420,"Completed"),
("R007","U007","Mumbai",7.5,150,"Completed"),
("R008","U008","Bangalore",18.0,330,"Completed"),
("R009","U009","Delhi",6.0,140,"Cancelled"),
("R010","U010","Hyderabad",10.0,200,"Completed")
]
rides_cols = [
"ride_id","user_id",
"city","distance_km",
"duration_seconds","status"
]
rides_df = spark.createDataFrame(rides_data, rides_cols)
rides_df.show()

+-------+-------+---------+-----------+----------------+---------+
|ride_id|user_id|     city|distance_km|duration_seconds|   status|
+-------+-------+---------+-----------+----------------+---------+
|   R001|   U001|Hyderabad|       12.5|             240|Completed|
|   R002|   U002|    Delhi|        8.2|             180|Completed|
|   R003|   U003|   Mumbai|       15.0|             300|Cancelled|
|   R004|   U004|Bangalore|        5.5|             120|Completed|
|   R005|   U005|Hyderabad|       20.0|             360|Completed|
|   R006|   U006|    Delhi|       25.0|             420|Completed|
|   R007|   U007|   Mumbai|        7.5|             150|Completed|
|   R008|   U008|Bangalore|       18.0|             330|Completed|
|   R009|   U009|    Delhi|        6.0|             140|Cancelled|
|   R010|   U010|Hyderabad|       10.0|             200|Completed|
+-------+-------+---------+-----------+----------------+---------+



#Dataset 2

In [6]:
surge_data = [
("Hyderabad",1.2),
("Delhi",1.5),
("Mumbai",1.8),
("Bangalore",1.3)
]
surge_cols = ["city","surge_multiplier"]
surge_df = spark.createDataFrame(surge_data, surge_cols)
surge_df.show()

+---------+----------------+
|     city|surge_multiplier|
+---------+----------------+
|Hyderabad|             1.2|
|    Delhi|             1.5|
|   Mumbai|             1.8|
|Bangalore|             1.3|
+---------+----------------+



#EXERCISE SET 1 — TRANSFORMATIONS vs ACTIONS
###Exercise 1.1

Create a transformation pipeline that:

*   Filters only Completed rides
*   Selects ride_id , city , distance_km


In [7]:
pipeline_df = (
    rides_df
    .filter(col("status") == "Completed")
    .select("ride_id","city","distance_km")
)

###Exercise 1.2

Trigger a single action on the pipeline.
Tasks:


*   Identify which line caused execution
*   Explain why previous lines did not execute


In [8]:
pipeline_df.show()

+-------+---------+-----------+
|ride_id|     city|distance_km|
+-------+---------+-----------+
|   R001|Hyderabad|       12.5|
|   R002|    Delhi|        8.2|
|   R004|Bangalore|        5.5|
|   R005|Hyderabad|       20.0|
|   R006|    Delhi|       25.0|
|   R007|   Mumbai|        7.5|
|   R008|Bangalore|       18.0|
|   R010|Hyderabad|       10.0|
+-------+---------+-----------+



In [9]:
pipeline_df.explain(True)

== Parsed Logical Plan ==
'Project ['ride_id, 'city, 'distance_km]
+- Filter (status#5 = Completed)
   +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Analyzed Logical Plan ==
ride_id: string, city: string, distance_km: double
Project [ride_id#0, city#2, distance_km#3]
+- Filter (status#5 = Completed)
   +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Optimized Logical Plan ==
Project [ride_id#0, city#2, distance_km#3]
+- Filter (isnotnull(status#5) AND (status#5 = Completed))
   +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Physical Plan ==
*(1) Project [ride_id#0, city#2, distance_km#3]
+- *(1) Filter (isnotnull(status#5) AND (status#5 = Completed))
   +- *(1) Scan ExistingRDD[ride_id#0,user_id#1,city#2,distance_km#3,duration_seconds#4L,status#5]



#EXERCISE SET 2 — DAG & LINEAGE

###Exercise 2.1

Create a transformation chain with:

*  Multiple filters
*  A column selection

Tasks:


*  Run explain(True)
*  Identify:
    *  Logical plan
    *  Optimized logical plan
    *  Physical plan


In [10]:
dag_df = (
    rides_df
    .filter(col("status") == "Completed")
    .filter(col("distance_km") > 10)
    .select("ride_id","city","distance_km")
)

dag_df.explain(True)

== Parsed Logical Plan ==
'Project ['ride_id, 'city, 'distance_km]
+- Filter (distance_km#3 > cast(10 as double))
   +- Filter (status#5 = Completed)
      +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Analyzed Logical Plan ==
ride_id: string, city: string, distance_km: double
Project [ride_id#0, city#2, distance_km#3]
+- Filter (distance_km#3 > cast(10 as double))
   +- Filter (status#5 = Completed)
      +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Optimized Logical Plan ==
Project [ride_id#0, city#2, distance_km#3]
+- Filter ((isnotnull(status#5) AND isnotnull(distance_km#3)) AND ((status#5 = Completed) AND (distance_km#3 > 10.0)))
   +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Physical Plan ==
*(1) Project [ride_id#0, city#2, distance_km#3]
+- *(1) Filter ((isnotnull(status#5) AND isnotnull(distance_km#3)) AND ((s

#Exercise 2.2

Reorder transformations (filter after join vs before join).

Tasks:

*   Compare DAGs
*   Identify which plan is more efficient and why


In [11]:
bad_df = (
    rides_df
    .join(surge_df, "city")
    .filter(col("distance_km") > 10)
)

bad_df.explain(True)

== Parsed Logical Plan ==
'Filter '`>`('distance_km, 10)
+- Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#28]
   +- Join Inner, (city#2 = city#27)
      :- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
      +- LogicalRDD [city#27, surge_multiplier#28], false

== Analyzed Logical Plan ==
city: string, ride_id: string, user_id: string, distance_km: double, duration_seconds: bigint, status: string, surge_multiplier: double
Filter (distance_km#3 > cast(10 as double))
+- Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#28]
   +- Join Inner, (city#2 = city#27)
      :- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
      +- LogicalRDD [city#27, surge_multiplier#28], false

== Optimized Logical Plan ==
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, sur

In [12]:
good_df = (
    rides_df
    .filter(col("distance_km") > 10)
    .join(surge_df, "city")
)

good_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [city])
:- Filter (distance_km#3 > cast(10 as double))
:  +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
+- LogicalRDD [city#27, surge_multiplier#28], false

== Analyzed Logical Plan ==
city: string, ride_id: string, user_id: string, distance_km: double, duration_seconds: bigint, status: string, surge_multiplier: double
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#28]
+- Join Inner, (city#2 = city#27)
   :- Filter (distance_km#3 > cast(10 as double))
   :  +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
   +- LogicalRDD [city#27, surge_multiplier#28], false

== Optimized Logical Plan ==
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#28]
+- Join Inner, (city#2 = city#27)
   :- Filter ((isnotnull(distance_km#3) AND (distance_km#3 

#EXERCISE SET 3 — PARTITIONS & SHUFFLE

###Exercise 3.1

Check the number of partitions of rides_df .
Tasks:


*  Repartition into 4 partitions
*  Coalesce into 1 partition
*  Observe number of output files when writing to Parquet


In [13]:
rides_df.rdd.getNumPartitions()

2

In [14]:
repart_df = rides_df.repartition(4)
repart_df.rdd.getNumPartitions()

4

In [15]:
coal_df = rides_df.coalesce(1)
coal_df.rdd.getNumPartitions()

1

In [16]:
coal_df.write.mode("overwrite").parquet("/tmp/rides_single_file")

###Exercise 3.2

Repartition rides by city .

Tasks:

*  Run explain(True)
*  Identify whether a shuffle is introduced


In [17]:
city_part_df = rides_df.repartition("city")
city_part_df.explain(True)

== Parsed Logical Plan ==
'RepartitionByExpression ['city]
+- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Analyzed Logical Plan ==
ride_id: string, user_id: string, city: string, distance_km: double, duration_seconds: bigint, status: string
RepartitionByExpression [city#2]
+- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Optimized Logical Plan ==
RepartitionByExpression [city#2]
+- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange hashpartitioning(city#2, 200), REPARTITION_BY_COL, [plan_id=183]
   +- Scan ExistingRDD[ride_id#0,user_id#1,city#2,distance_km#3,duration_seconds#4L,status#5]



#EXERCISE SET 4 — JOIN WITHOUT BROADCAST (BAD DAG)

Exercise 4.1

Join rides_df with surge_df on city without using broadcast.

Tasks:
*  Run explain(True)
*  Identify:
    *  Join type
    *  Exchange operators
    *  Sort operations
    *  Stage boundaries

In [18]:
join_df = rides_df.join(surge_df, "city")

join_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [city])
:- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
+- LogicalRDD [city#27, surge_multiplier#28], false

== Analyzed Logical Plan ==
city: string, ride_id: string, user_id: string, distance_km: double, duration_seconds: bigint, status: string, surge_multiplier: double
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#28]
+- Join Inner, (city#2 = city#27)
   :- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
   +- LogicalRDD [city#27, surge_multiplier#28], false

== Optimized Logical Plan ==
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#28]
+- Join Inner, (city#2 = city#27)
   :- Filter isnotnull(city#2)
   :  +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
   +- Filter isnotnull(city

###Exercise 4.2

Apply a filter ( distance_km > 10 ) before the join.

Tasks:

*  Observe whether shuffle is removed
*  Explain why or why not


In [19]:
filtered_join_df = (
    rides_df
    .filter(col("distance_km") > 10)
    .join(surge_df, "city")
)

filtered_join_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [city])
:- Filter (distance_km#3 > cast(10 as double))
:  +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
+- LogicalRDD [city#27, surge_multiplier#28], false

== Analyzed Logical Plan ==
city: string, ride_id: string, user_id: string, distance_km: double, duration_seconds: bigint, status: string, surge_multiplier: double
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#28]
+- Join Inner, (city#2 = city#27)
   :- Filter (distance_km#3 > cast(10 as double))
   :  +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
   +- LogicalRDD [city#27, surge_multiplier#28], false

== Optimized Logical Plan ==
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#28]
+- Join Inner, (city#2 = city#27)
   :- Filter ((isnotnull(distance_km#3) AND (distance_km#3 

#EXERCISE SET 5 — BROADCAST JOIN (GOOD DAG)

###Exercise 5.1

Apply a broadcast hint to surge_df .

Tasks:
*  Run explain(True)
*  Identify:
    *  Join type
    *  BroadcastExchange
    *  Disappearance of shuffles

In [21]:
from pyspark.sql.functions import broadcast
broadcast_join_df = rides_df.join(
    broadcast(surge_df),
    "city"
)

broadcast_join_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [city])
:- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
+- ResolvedHint (strategy=broadcast)
   +- LogicalRDD [city#27, surge_multiplier#28], false

== Analyzed Logical Plan ==
city: string, ride_id: string, user_id: string, distance_km: double, duration_seconds: bigint, status: string, surge_multiplier: double
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#28]
+- Join Inner, (city#2 = city#27)
   :- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
   +- ResolvedHint (strategy=broadcast)
      +- LogicalRDD [city#27, surge_multiplier#28], false

== Optimized Logical Plan ==
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#28]
+- Join Inner, (city#2 = city#27), rightHint=(strategy=broadcast)
   :- Filter isnotnull(city#2)
   :  +- Logical

###Exercise 5.2

Compare physical plans from:

*  Exercise 4.1
*  Exercise 5.1

Tasks:
*   List operators that disappeared
*   Explain performance impact


In [22]:
join_df.explain(True)          # From 4.1
broadcast_join_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [city])
:- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
+- LogicalRDD [city#27, surge_multiplier#28], false

== Analyzed Logical Plan ==
city: string, ride_id: string, user_id: string, distance_km: double, duration_seconds: bigint, status: string, surge_multiplier: double
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#28]
+- Join Inner, (city#2 = city#27)
   :- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
   +- LogicalRDD [city#27, surge_multiplier#28], false

== Optimized Logical Plan ==
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#28]
+- Join Inner, (city#2 = city#27)
   :- Filter isnotnull(city#2)
   :  +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
   +- Filter isnotnull(city

#EXERCISE SET 6 — DAG INTERPRETATION

###Exercise 6.1

From the physical plan:

*  Identify all expensive operators
*  Classify them as CPU, memory, or network heavy


###Exercise 6.2

Explain why Spark defaults to SortMergeJoin .

#EXERCISE SET 7 — ACTION-DRIVEN EXECUTION

#Exercise 7.1

Create a long transformation pipeline without any action.

Tasks:
*   Explain what Spark has done so far



In [23]:
long_df = (
    rides_df
    .filter(col("status") == "Completed")
    .filter(col("distance_km") > 5)
    .join(broadcast(surge_df), "city")
    .select("ride_id","city","distance_km","surge_multiplier")
)

#Exercise 7.2

Trigger different actions ( count , show , write ) separately.

Tasks:
*  Observe whether Spark recomputes the DAG
*  Explain behavior


In [24]:
long_df.count()
long_df.show()
long_df.write.mode("overwrite").parquet("/tmp/final_rides")

+-------+---------+-----------+----------------+
|ride_id|     city|distance_km|surge_multiplier|
+-------+---------+-----------+----------------+
|   R001|Hyderabad|       12.5|             1.2|
|   R002|    Delhi|        8.2|             1.5|
|   R004|Bangalore|        5.5|             1.3|
|   R005|Hyderabad|       20.0|             1.2|
|   R006|    Delhi|       25.0|             1.5|
|   R007|   Mumbai|        7.5|             1.8|
|   R008|Bangalore|       18.0|             1.3|
|   R010|Hyderabad|       10.0|             1.2|
+-------+---------+-----------+----------------+

