In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Food Delivery Analytics").getOrCreate()

In [2]:
orders_data = [
("O001","North","Delhi","Rest-01","Pizza","2024-02-01",450,35),
("O002","North","Delhi","Rest-01","Burger","2024-02-01",250,25),
("O003","North","Chandigarh","Rest-02","Pasta","2024-02-02",350,30),
("O004","South","Bangalore","Rest-03","Pizza","2024-02-01",500,40),
("O005","South","Chennai","Rest-04","Burger","2024-02-02",220,20),
("O006","South","Bangalore","Rest-03","Pasta","2024-02-03",380,32),
("O007","East","Kolkata","Rest-05","Pizza","2024-02-01",420,38),
("O008","East","Kolkata","Rest-05","Burger","2024-02-02",260,26),
("O009","East","Patna","Rest-06","Pasta","2024-02-03",300,28),
("O010","West","Mumbai","Rest-07","Pizza","2024-02-01",520,42),
("O011","West","Mumbai","Rest-07","Burger","2024-02-02",280,27),
("O012","West","Pune","Rest-08","Pasta","2024-02-03",340,31),
("O013","North","Delhi","Rest-01","Pizza","2024-02-04",480,37),
("O014","South","Chennai","Rest-04","Pizza","2024-02-04",510,41),
("O015","East","Patna","Rest-06","Burger","2024-02-04",240,24),
("O016","West","Pune","Rest-08","Pizza","2024-02-04",500,39),
("O017","North","Chandigarh","Rest-02","Burger","2024-02-05",260,26),
("O018","South","Bangalore","Rest-03","Burger","2024-02-05",290,29),
("O019","East","Kolkata","Rest-05","Pasta","2024-02-05",360,33),
("O020","West","Mumbai","Rest-07","Pasta","2024-02-05",390,34),
("O021","North","Delhi","Rest-01","Pasta","2024-02-06",370,30),
("O022","South","Chennai","Rest-04","Pasta","2024-02-06",330,29),
("O023","East","Patna","Rest-06","Pizza","2024-02-06",460,36),
("O024","West","Pune","Rest-08","Burger","2024-02-06",270,26)
]

columns = [
"order_id","region","city","restaurant_id","food_item","order_date","amount","delivery_time_min"
]
df_orders = spark.createDataFrame(orders_data, columns)
df_orders.show(5)
df_orders.printSchema()

+--------+------+----------+-------------+---------+----------+------+-----------------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+----------+-------------+---------+----------+------+-----------------+
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|
|    O004| South| Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|
|    O005| South|   Chennai|      Rest-04|   Burger|2024-02-02|   220|               20|
+--------+------+----------+-------------+---------+----------+------+-----------------+
only showing top 5 rows
root
 |-- order_id: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- restaurant_id: string (nullable = true)
 |-- food_i

#EXERCISE SET 1 — SELECT OPERATIONS

1. Select only order_id , region , food_item , amount
2. Rename amount to order_value
3. Create a new column amount_in_hundreds
4. Select distinct combinations of region and food_item
5. Reorder columns in a logical reporting format
6. Create a column order_day extracted from order_date

In [3]:
from pyspark.sql.functions import *

In [4]:
#1
df_orders.select("order_id","region","food_item","amount").show()

+--------+------+---------+------+
|order_id|region|food_item|amount|
+--------+------+---------+------+
|    O001| North|    Pizza|   450|
|    O002| North|   Burger|   250|
|    O003| North|    Pasta|   350|
|    O004| South|    Pizza|   500|
|    O005| South|   Burger|   220|
|    O006| South|    Pasta|   380|
|    O007|  East|    Pizza|   420|
|    O008|  East|   Burger|   260|
|    O009|  East|    Pasta|   300|
|    O010|  West|    Pizza|   520|
|    O011|  West|   Burger|   280|
|    O012|  West|    Pasta|   340|
|    O013| North|    Pizza|   480|
|    O014| South|    Pizza|   510|
|    O015|  East|   Burger|   240|
|    O016|  West|    Pizza|   500|
|    O017| North|   Burger|   260|
|    O018| South|   Burger|   290|
|    O019|  East|    Pasta|   360|
|    O020|  West|    Pasta|   390|
+--------+------+---------+------+
only showing top 20 rows


In [5]:
#2
df_orders.select("order_id","region","food_item",col("amount").alias("order_value")).show()

+--------+------+---------+-----------+
|order_id|region|food_item|order_value|
+--------+------+---------+-----------+
|    O001| North|    Pizza|        450|
|    O002| North|   Burger|        250|
|    O003| North|    Pasta|        350|
|    O004| South|    Pizza|        500|
|    O005| South|   Burger|        220|
|    O006| South|    Pasta|        380|
|    O007|  East|    Pizza|        420|
|    O008|  East|   Burger|        260|
|    O009|  East|    Pasta|        300|
|    O010|  West|    Pizza|        520|
|    O011|  West|   Burger|        280|
|    O012|  West|    Pasta|        340|
|    O013| North|    Pizza|        480|
|    O014| South|    Pizza|        510|
|    O015|  East|   Burger|        240|
|    O016|  West|    Pizza|        500|
|    O017| North|   Burger|        260|
|    O018| South|   Burger|        290|
|    O019|  East|    Pasta|        360|
|    O020|  West|    Pasta|        390|
+--------+------+---------+-----------+
only showing top 20 rows


In [6]:
#3
df_orders.withColumn("amount_in_hundreds",col("amount")/100).show()

+--------+------+----------+-------------+---------+----------+------+-----------------+------------------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|amount_in_hundreds|
+--------+------+----------+-------------+---------+----------+------+-----------------+------------------+
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|               4.5|
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|               2.5|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|               3.5|
|    O004| South| Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|               5.0|
|    O005| South|   Chennai|      Rest-04|   Burger|2024-02-02|   220|               20|               2.2|
|    O006| South| Bangalore|      Rest-03|    Pasta|2024-02-03|   380|               32|               3.8|
|    O007|  East|   Kolkata|

In [7]:
#4
df_orders.select("region","food_item").distinct().show()

+------+---------+
|region|food_item|
+------+---------+
|  West|   Burger|
|  East|    Pizza|
|  West|    Pizza|
| North|    Pizza|
| South|    Pizza|
|  East|   Burger|
| North|    Pasta|
|  East|    Pasta|
| North|   Burger|
| South|    Pasta|
| South|   Burger|
|  West|    Pasta|
+------+---------+



In [8]:
#5
df_orders.select("order_id","order_date","region","city","restaurant_id","food_item","amount","delivery_time_min").show()

+--------+----------+------+----------+-------------+---------+------+-----------------+
|order_id|order_date|region|      city|restaurant_id|food_item|amount|delivery_time_min|
+--------+----------+------+----------+-------------+---------+------+-----------------+
|    O001|2024-02-01| North|     Delhi|      Rest-01|    Pizza|   450|               35|
|    O002|2024-02-01| North|     Delhi|      Rest-01|   Burger|   250|               25|
|    O003|2024-02-02| North|Chandigarh|      Rest-02|    Pasta|   350|               30|
|    O004|2024-02-01| South| Bangalore|      Rest-03|    Pizza|   500|               40|
|    O005|2024-02-02| South|   Chennai|      Rest-04|   Burger|   220|               20|
|    O006|2024-02-03| South| Bangalore|      Rest-03|    Pasta|   380|               32|
|    O007|2024-02-01|  East|   Kolkata|      Rest-05|    Pizza|   420|               38|
|    O008|2024-02-02|  East|   Kolkata|      Rest-05|   Burger|   260|               26|
|    O009|2024-02-03|

In [12]:
#6
df_orders.withColumn("order_day",date_format(col("order_date"),"EEEE")).show()
#df_orders.withColumn("order_day",dayofweek(col("order_date"))).show()

+--------+------+----------+-------------+---------+----------+------+-----------------+---------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|order_day|
+--------+------+----------+-------------+---------+----------+------+-----------------+---------+
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35| Thursday|
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25| Thursday|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|   Friday|
|    O004| South| Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40| Thursday|
|    O005| South|   Chennai|      Rest-04|   Burger|2024-02-02|   220|               20|   Friday|
|    O006| South| Bangalore|      Rest-03|    Pasta|2024-02-03|   380|               32| Saturday|
|    O007|  East|   Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38| Thursday|
|    O008|

#EXERCISE SET 2 — FILTER OPERATIONS

1. Filter orders where amount > 400
2. Filter only Pizza orders
3. Filter orders from Delhi and Mumbai
4. Filter orders with delivery time greater than 35 minutes
5. Apply multiple conditions using AND and OR
6. Apply filters in different orders and compare explain(True)
7. Identify which filters are pushed down by Spark

In [13]:
#1
df_orders.filter(col("amount")>400).show()

+--------+------+---------+-------------+---------+----------+------+-----------------+
|order_id|region|     city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+---------+-------------+---------+----------+------+-----------------+
|    O001| North|    Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|
|    O004| South|Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|
|    O007|  East|  Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|
|    O010|  West|   Mumbai|      Rest-07|    Pizza|2024-02-01|   520|               42|
|    O013| North|    Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|
|    O014| South|  Chennai|      Rest-04|    Pizza|2024-02-04|   510|               41|
|    O016|  West|     Pune|      Rest-08|    Pizza|2024-02-04|   500|               39|
|    O023|  East|    Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|
+--------+------+---------+-----

In [14]:
#2
df_orders.filter(col("food_item") == "Pizza").show()

+--------+------+---------+-------------+---------+----------+------+-----------------+
|order_id|region|     city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+---------+-------------+---------+----------+------+-----------------+
|    O001| North|    Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|
|    O004| South|Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|
|    O007|  East|  Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|
|    O010|  West|   Mumbai|      Rest-07|    Pizza|2024-02-01|   520|               42|
|    O013| North|    Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|
|    O014| South|  Chennai|      Rest-04|    Pizza|2024-02-04|   510|               41|
|    O016|  West|     Pune|      Rest-08|    Pizza|2024-02-04|   500|               39|
|    O023|  East|    Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|
+--------+------+---------+-----

In [15]:
#3
#df_orders.filter((col("city") == "Delhi") | (col("city") == "Mumbai")).show()
df_orders.filter(col("city").isin("Delhi","Mumbai")).show()

+--------+------+------+-------------+---------+----------+------+-----------------+
|order_id|region|  city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+------+-------------+---------+----------+------+-----------------+
|    O001| North| Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|
|    O002| North| Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|
|    O010|  West|Mumbai|      Rest-07|    Pizza|2024-02-01|   520|               42|
|    O011|  West|Mumbai|      Rest-07|   Burger|2024-02-02|   280|               27|
|    O013| North| Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|
|    O020|  West|Mumbai|      Rest-07|    Pasta|2024-02-05|   390|               34|
|    O021| North| Delhi|      Rest-01|    Pasta|2024-02-06|   370|               30|
+--------+------+------+-------------+---------+----------+------+-----------------+



In [16]:
#4
df_orders.filter(col("delivery_time_min")>35).show()

+--------+------+---------+-------------+---------+----------+------+-----------------+
|order_id|region|     city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+---------+-------------+---------+----------+------+-----------------+
|    O004| South|Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|
|    O007|  East|  Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|
|    O010|  West|   Mumbai|      Rest-07|    Pizza|2024-02-01|   520|               42|
|    O013| North|    Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|
|    O014| South|  Chennai|      Rest-04|    Pizza|2024-02-04|   510|               41|
|    O016|  West|     Pune|      Rest-08|    Pizza|2024-02-04|   500|               39|
|    O023|  East|    Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|
+--------+------+---------+-------------+---------+----------+------+-----------------+



In [17]:
#5 AND
df_orders.filter(
    (col("amount")>400) & (col("delivery_time_min")>35)
).show()

+--------+------+---------+-------------+---------+----------+------+-----------------+
|order_id|region|     city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+---------+-------------+---------+----------+------+-----------------+
|    O004| South|Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|
|    O007|  East|  Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|
|    O010|  West|   Mumbai|      Rest-07|    Pizza|2024-02-01|   520|               42|
|    O013| North|    Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|
|    O014| South|  Chennai|      Rest-04|    Pizza|2024-02-04|   510|               41|
|    O016|  West|     Pune|      Rest-08|    Pizza|2024-02-04|   500|               39|
|    O023|  East|    Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|
+--------+------+---------+-------------+---------+----------+------+-----------------+



In [18]:
df_orders.filter(
    (col("amount")>400) | (col("delivery_time_min")>35)).show()

+--------+------+---------+-------------+---------+----------+------+-----------------+
|order_id|region|     city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+---------+-------------+---------+----------+------+-----------------+
|    O001| North|    Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|
|    O004| South|Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|
|    O007|  East|  Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|
|    O010|  West|   Mumbai|      Rest-07|    Pizza|2024-02-01|   520|               42|
|    O013| North|    Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|
|    O014| South|  Chennai|      Rest-04|    Pizza|2024-02-04|   510|               41|
|    O016|  West|     Pune|      Rest-08|    Pizza|2024-02-04|   500|               39|
|    O023|  East|    Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|
+--------+------+---------+-----

In [19]:
#6
df_orders.filter(col("amount")>400).filter(col("food_item") == "Pizza").explain(True)

== Parsed Logical Plan ==
'Filter '`=`('food_item, Pizza)
+- Filter (amount#6L > cast(400 as bigint))
   +- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
order_id: string, region: string, city: string, restaurant_id: string, food_item: string, order_date: string, amount: bigint, delivery_time_min: bigint
Filter (food_item#4 = Pizza)
+- Filter (amount#6L > cast(400 as bigint))
   +- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Optimized Logical Plan ==
Filter ((isnotnull(amount#6L) AND isnotnull(food_item#4)) AND ((amount#6L > 400) AND (food_item#4 = Pizza)))
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Physical Plan ==
*(1) Filter ((isnotnull(amount#6L) AND isnotnull(food_item#4)) AND ((amount#6L > 400) AND (

In [20]:
#7
df_orders.filter(col("amount")>400).explain(True)

== Parsed Logical Plan ==
'Filter '`>`('amount, 400)
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
order_id: string, region: string, city: string, restaurant_id: string, food_item: string, order_date: string, amount: bigint, delivery_time_min: bigint
Filter (amount#6L > cast(400 as bigint))
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Optimized Logical Plan ==
Filter (isnotnull(amount#6L) AND (amount#6L > 400))
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Physical Plan ==
*(1) Filter (isnotnull(amount#6L) AND (amount#6L > 400))
+- *(1) Scan ExistingRDD[order_id#0,region#1,city#2,restaurant_id#3,food_item#4,order_date#5,amount#6L,delivery_time_min#7L]



#EXERCISE SET 3 — TRANSFORMATIONS vs ACTIONS

1. Build a pipeline with:
select
filter
derived column
2. Do not call any action
3. Explain what Spark has done so far
4. Trigger count() and observe execution
5. Trigger show() and compare behavior

In [21]:
#1
df_pipeline = df_orders.select("region","food_item","amount").filter(col("amount")>300).withColumn("amount_taxed",col("amount")*1.1)
#2 no action

In [22]:
#3
df_pipeline.explain()

== Physical Plan ==
*(1) Project [region#1, food_item#4, amount#6L, (cast(amount#6L as double) * 1.1) AS amount_taxed#387]
+- *(1) Filter (isnotnull(amount#6L) AND (amount#6L > 300))
   +- *(1) Scan ExistingRDD[order_id#0,region#1,city#2,restaurant_id#3,food_item#4,order_date#5,amount#6L,delivery_time_min#7L]




In [23]:
#4
df_pipeline.count()

15

In [24]:
#5
df_pipeline.show()

+------+---------+------+------------------+
|region|food_item|amount|      amount_taxed|
+------+---------+------+------------------+
| North|    Pizza|   450|495.00000000000006|
| North|    Pasta|   350|385.00000000000006|
| South|    Pizza|   500|             550.0|
| South|    Pasta|   380|418.00000000000006|
|  East|    Pizza|   420|462.00000000000006|
|  West|    Pizza|   520|             572.0|
|  West|    Pasta|   340|374.00000000000006|
| North|    Pizza|   480|             528.0|
| South|    Pizza|   510|             561.0|
|  West|    Pizza|   500|             550.0|
|  East|    Pasta|   360|396.00000000000006|
|  West|    Pasta|   390|429.00000000000006|
| North|    Pasta|   370|407.00000000000006|
| South|    Pasta|   330|363.00000000000006|
|  East|    Pizza|   460|506.00000000000006|
+------+---------+------+------------------+



#EXERCISE SET 4 — PARTITIONS & FILE LAYOUT

1. Check the number of partitions of df_orders
2. Repartition the DataFrame into 4 partitions
3. Coalesce the DataFrame into 1 partition
4. Write repartitioned data to Parquet and count files
5. Write coalesced data to Parquet and count files
6. Explain why file counts differ

In [25]:
#1
df_orders.rdd.getNumPartitions()

2

In [27]:
#2
df_repart = df_orders.repartition(4)
df_repart.rdd.getNumPartitions()

4

In [28]:
#3
df_coalesce = df_orders.coalesce(1)
df_coalesce.rdd.getNumPartitions()

1

In [30]:
#4
df_repart.write.mode("overwrite").parquet("repart_orders")

In [32]:
#5
df_coalesce.write.mode("overwrite").parquet("orders_coalesce")

#EXERCISE SET 5 — GROUPBY & AGGREGATE FUNCTIONS

1. Total revenue per region
2. Average order amount per food item
3. Maximum order amount per city
4. Minimum delivery time per restaurant
5. Count number of orders per region
6. Total revenue per restaurant
7. Region + food item wise total revenue
8. City wise average delivery time
9. Identify regions with revenue above a threshold
10. Use explain(True) and identify shuffle operators

In [33]:
#1
df_orders.groupBy("region").agg(sum("amount").alias("total_revenue")).show()

+------+-------------+
|region|total_revenue|
+------+-------------+
| South|         2230|
|  East|         2040|
|  West|         2300|
| North|         2160|
+------+-------------+



In [34]:
#2
df_orders.groupBy("food_item").agg(avg("amount").alias("avg_amount")).show()

+---------+----------+
|food_item|avg_amount|
+---------+----------+
|   Burger|    258.75|
|    Pizza|     480.0|
|    Pasta|     352.5|
+---------+----------+



In [36]:
#3
df_orders.groupBy("city").agg(max("amount").alias("max_amount")).show()

+----------+----------+
|      city|max_amount|
+----------+----------+
| Bangalore|       500|
|     Patna|       460|
|   Chennai|       510|
|    Mumbai|       520|
|   Kolkata|       420|
|      Pune|       500|
|     Delhi|       480|
|Chandigarh|       350|
+----------+----------+



In [37]:
#4
df_orders.groupBy("restaurant_id").agg(min("delivery_time_min").alias("min_delivery_time")).show()

+-------------+-----------------+
|restaurant_id|min_delivery_time|
+-------------+-----------------+
|      Rest-01|               25|
|      Rest-06|               24|
|      Rest-04|               20|
|      Rest-03|               29|
|      Rest-02|               26|
|      Rest-08|               26|
|      Rest-07|               27|
|      Rest-05|               26|
+-------------+-----------------+



In [38]:
#5
df_orders.groupBy("region").agg(count("*").alias("order_count")).show()

+------+-----------+
|region|order_count|
+------+-----------+
| South|          6|
|  East|          6|
|  West|          6|
| North|          6|
+------+-----------+



In [39]:
#6
df_orders.groupBy("restaurant_id").agg(sum("amount").alias("total_revenue")).show()


+-------------+-------------+
|restaurant_id|total_revenue|
+-------------+-------------+
|      Rest-01|         1550|
|      Rest-06|         1000|
|      Rest-04|         1060|
|      Rest-03|         1170|
|      Rest-02|          610|
|      Rest-08|         1110|
|      Rest-07|         1190|
|      Rest-05|         1040|
+-------------+-------------+



In [40]:
#7
df_orders.groupBy("region","food_item").agg(sum("amount").alias("total_revenue")).show()


+------+---------+-------------+
|region|food_item|total_revenue|
+------+---------+-------------+
|  West|   Burger|          550|
|  East|    Pizza|          880|
|  West|    Pizza|         1020|
| North|    Pizza|          930|
| South|    Pizza|         1010|
|  East|   Burger|          500|
| North|    Pasta|          720|
|  East|    Pasta|          660|
| North|   Burger|          510|
| South|    Pasta|          710|
| South|   Burger|          510|
|  West|    Pasta|          730|
+------+---------+-------------+



In [41]:
#8
df_orders.groupBy("city").agg(avg("delivery_time_min").alias("avg_delivery_time")).show()

+----------+------------------+
|      city| avg_delivery_time|
+----------+------------------+
| Bangalore|33.666666666666664|
|     Patna|29.333333333333332|
|   Chennai|              30.0|
|    Mumbai|34.333333333333336|
|   Kolkata|32.333333333333336|
|      Pune|              32.0|
|     Delhi|             31.75|
|Chandigarh|              28.0|
+----------+------------------+



In [42]:
#9
df_orders.groupBy("region").agg(sum("amount").alias("total_revenue")).filter(col("total_revenue")>1000).show()

+------+-------------+
|region|total_revenue|
+------+-------------+
| South|         2230|
|  East|         2040|
|  West|         2300|
| North|         2160|
+------+-------------+



In [43]:
#10
df_orders.groupBy("region").agg(sum("amount").alias("total_revenue")).explain(True)

== Parsed Logical Plan ==
'Aggregate ['region], ['region, 'sum('amount) AS total_revenue#597]
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
region: string, total_revenue: bigint
Aggregate [region#1], [region#1, sum(amount#6L) AS total_revenue#597L]
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Optimized Logical Plan ==
Aggregate [region#1], [region#1, sum(amount#6L) AS total_revenue#597L]
+- Project [region#1, amount#6L]
   +- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[region#1], functions=[sum(amount#6L)], output=[region#1, total_revenue#597L])
   +- Exchange hashpartitioning(region#1, 200), ENSURE_REQUIREMENTS, [plan_id=912]
   

#EXERCISE SET 6 — WINDOW FUNCTIONS (OVER)

1. Compute running total of revenue per region ordered by date
2. Rank orders by amount within each region
3. Assign row numbers per restaurant based on delivery time
4. Use dense rank to rank food items per region by revenue
5. Identify top 2 highest value orders per region
6. Compare rank , dense_rank , and row_number outputs
7. Calculate cumulative delivery time per restaurant

In [44]:
from pyspark.sql.window import Window

In [46]:
w_region_date = Window.partitionBy("region") \
                      .orderBy("order_date") \
                      .rowsBetween(Window.unboundedPreceding, Window.currentRow)

In [47]:
#1
df_orders.withColumn("running_revenue",sum("amount").over(w_region_date)).show()

+--------+------+----------+-------------+---------+----------+------+-----------------+---------------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|running_revenue|
+--------+------+----------+-------------+---------+----------+------+-----------------+---------------+
|    O007|  East|   Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|            420|
|    O008|  East|   Kolkata|      Rest-05|   Burger|2024-02-02|   260|               26|            680|
|    O009|  East|     Patna|      Rest-06|    Pasta|2024-02-03|   300|               28|            980|
|    O015|  East|     Patna|      Rest-06|   Burger|2024-02-04|   240|               24|           1220|
|    O019|  East|   Kolkata|      Rest-05|    Pasta|2024-02-05|   360|               33|           1580|
|    O023|  East|     Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|           2040|
|    O001| North|     Delhi|      Rest-01|    Pizza|202

In [48]:
#2
w = Window.partitionBy("region").orderBy(col("amount").desc())

df_orders.withColumn("rank", rank().over(w)).show()

+--------+------+----------+-------------+---------+----------+------+-----------------+----+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|rank|
+--------+------+----------+-------------+---------+----------+------+-----------------+----+
|    O023|  East|     Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|   1|
|    O007|  East|   Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|   2|
|    O019|  East|   Kolkata|      Rest-05|    Pasta|2024-02-05|   360|               33|   3|
|    O009|  East|     Patna|      Rest-06|    Pasta|2024-02-03|   300|               28|   4|
|    O008|  East|   Kolkata|      Rest-05|   Burger|2024-02-02|   260|               26|   5|
|    O015|  East|     Patna|      Rest-06|   Burger|2024-02-04|   240|               24|   6|
|    O013| North|     Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|   1|
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-

In [49]:
#3
w = Window.partitionBy("restaurant_id").orderBy("delivery_time_min")

df_orders.withColumn("row_num", row_number().over(w)).show()

+--------+------+----------+-------------+---------+----------+------+-----------------+-------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|row_num|
+--------+------+----------+-------------+---------+----------+------+-----------------+-------+
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|      1|
|    O021| North|     Delhi|      Rest-01|    Pasta|2024-02-06|   370|               30|      2|
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|      3|
|    O013| North|     Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|      4|
|    O017| North|Chandigarh|      Rest-02|   Burger|2024-02-05|   260|               26|      1|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|      2|
|    O018| South| Bangalore|      Rest-03|   Burger|2024-02-05|   290|               29|      1|
|    O006| South| Bangalore|  

In [50]:
#4
df_orders.withColumn("dense_rank", dense_rank().over(w)).show()

+--------+------+----------+-------------+---------+----------+------+-----------------+----------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|dense_rank|
+--------+------+----------+-------------+---------+----------+------+-----------------+----------+
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|         1|
|    O021| North|     Delhi|      Rest-01|    Pasta|2024-02-06|   370|               30|         2|
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|         3|
|    O013| North|     Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|         4|
|    O017| North|Chandigarh|      Rest-02|   Burger|2024-02-05|   260|               26|         1|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|         2|
|    O018| South| Bangalore|      Rest-03|   Burger|2024-02-05|   290|               29|         1|


In [51]:
#5
df_orders.withColumn("rank", rank().over(w)) \
         .filter(col("rank") <= 2).show()

+--------+------+----------+-------------+---------+----------+------+-----------------+----+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|rank|
+--------+------+----------+-------------+---------+----------+------+-----------------+----+
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|   1|
|    O021| North|     Delhi|      Rest-01|    Pasta|2024-02-06|   370|               30|   2|
|    O017| North|Chandigarh|      Rest-02|   Burger|2024-02-05|   260|               26|   1|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|   2|
|    O018| South| Bangalore|      Rest-03|   Burger|2024-02-05|   290|               29|   1|
|    O006| South| Bangalore|      Rest-03|    Pasta|2024-02-03|   380|               32|   2|
|    O005| South|   Chennai|      Rest-04|   Burger|2024-02-02|   220|               20|   1|
|    O022| South|   Chennai|      Rest-04|    Pasta|2024-02-

In [52]:
#6
df_orders.select(
    "region", "amount",
    rank().over(w).alias("rank"),
    dense_rank().over(w).alias("dense_rank"),
    row_number().over(w).alias("row_number")).show()

+------+------+----+----------+----------+
|region|amount|rank|dense_rank|row_number|
+------+------+----+----------+----------+
| North|   250|   1|         1|         1|
| North|   370|   2|         2|         2|
| North|   450|   3|         3|         3|
| North|   480|   4|         4|         4|
| North|   260|   1|         1|         1|
| North|   350|   2|         2|         2|
| South|   290|   1|         1|         1|
| South|   380|   2|         2|         2|
| South|   500|   3|         3|         3|
| South|   220|   1|         1|         1|
| South|   330|   2|         2|         2|
| South|   510|   3|         3|         3|
|  East|   260|   1|         1|         1|
|  East|   360|   2|         2|         2|
|  East|   420|   3|         3|         3|
|  East|   240|   1|         1|         1|
|  East|   300|   2|         2|         2|
|  East|   460|   3|         3|         3|
|  West|   280|   1|         1|         1|
|  West|   390|   2|         2|         2|
+------+---

In [53]:
#7
w = Window.partitionBy("restaurant_id") \
          .orderBy("order_date") \
          .rowsBetween(Window.unboundedPreceding, Window.currentRow)

df_orders.withColumn("cumulative_delivery_time",sum("delivery_time_min").over(w)).show()

+--------+------+----------+-------------+---------+----------+------+-----------------+------------------------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|cumulative_delivery_time|
+--------+------+----------+-------------+---------+----------+------+-----------------+------------------------+
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|                      35|
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|                      60|
|    O013| North|     Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|                      97|
|    O021| North|     Delhi|      Rest-01|    Pasta|2024-02-06|   370|               30|                     127|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|                      30|
|    O017| North|Chandigarh|      Rest-02|   Burger|2024-02-05|   260|               26|

#EXERCISE SET 7 — GROUPBY vs WINDOW (CONCEPTUAL)

1. Calculate total revenue per region using GroupBy
2. Calculate total revenue per region using Window
3. Compare:
Row count,
Output structure,
Use case
4. Explain why Window does not reduce rows

In [55]:
#1
df_orders.groupBy("region").agg(sum("amount").alias("total_revenue")).show()

+------+-------------+
|region|total_revenue|
+------+-------------+
| South|         2230|
|  East|         2040|
|  West|         2300|
| North|         2160|
+------+-------------+



In [56]:
#2
w = Window.partitionBy("region")
df_orders.withColumn("total_revenue", sum("amount").over(w)).show()

+--------+------+----------+-------------+---------+----------+------+-----------------+-------------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|total_revenue|
+--------+------+----------+-------------+---------+----------+------+-----------------+-------------+
|    O007|  East|   Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|         2040|
|    O008|  East|   Kolkata|      Rest-05|   Burger|2024-02-02|   260|               26|         2040|
|    O009|  East|     Patna|      Rest-06|    Pasta|2024-02-03|   300|               28|         2040|
|    O015|  East|     Patna|      Rest-06|   Burger|2024-02-04|   240|               24|         2040|
|    O019|  East|   Kolkata|      Rest-05|    Pasta|2024-02-05|   360|               33|         2040|
|    O023|  East|     Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|         2040|
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|   

#EXERCISE SET 8 — DAG & PERFORMANCE ANALYSIS

1. Run explain(True) for:
Simple select
Filter
GroupBy aggregation
Window function
2. Identify:
Exchange operators
Sort operations
Stage boundaries
3. Explain why window functions require sorting
4. Identify expensive operations in each DAG

In [57]:
#1
df_orders.select("region").explain(True)

== Parsed Logical Plan ==
'Project ['region]
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
region: string
Project [region#1]
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Optimized Logical Plan ==
Project [region#1]
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Physical Plan ==
*(1) Project [region#1]
+- *(1) Scan ExistingRDD[order_id#0,region#1,city#2,restaurant_id#3,food_item#4,order_date#5,amount#6L,delivery_time_min#7L]



In [58]:
#2
df_orders.filter(col("amount") > 400).explain(True)

== Parsed Logical Plan ==
'Filter '`>`('amount, 400)
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
order_id: string, region: string, city: string, restaurant_id: string, food_item: string, order_date: string, amount: bigint, delivery_time_min: bigint
Filter (amount#6L > cast(400 as bigint))
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Optimized Logical Plan ==
Filter (isnotnull(amount#6L) AND (amount#6L > 400))
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Physical Plan ==
*(1) Filter (isnotnull(amount#6L) AND (amount#6L > 400))
+- *(1) Scan ExistingRDD[order_id#0,region#1,city#2,restaurant_id#3,food_item#4,order_date#5,amount#6L,delivery_time_min#7L]



In [59]:
#3
df_orders.groupBy("region").sum("amount").explain(True)

== Parsed Logical Plan ==
'Aggregate ['region], ['region, unresolvedalias('sum(amount#6L))]
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
region: string, sum(amount): bigint
Aggregate [region#1], [region#1, sum(amount#6L) AS sum(amount)#921L]
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Optimized Logical Plan ==
Aggregate [region#1], [region#1, sum(amount#6L) AS sum(amount)#921L]
+- Project [region#1, amount#6L]
   +- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[region#1], functions=[sum(amount#6L)], output=[region#1, sum(amount)#921L])
   +- Exchange hashpartitioning(region#1, 200), ENSURE_REQUIREMENTS, [plan_id=1511]
      +- Has

In [61]:
#4
from pyspark.sql.window import Window
w = Window.partitionBy("region").orderBy(col("amount").desc())
df_orders.withColumn("rank", rank().over(w)).explain(True)

== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(rank, 'rank() windowspecdefinition('region, 'amount DESC NULLS LAST, unspecifiedframe$()), None)]
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
order_id: string, region: string, city: string, restaurant_id: string, food_item: string, order_date: string, amount: bigint, delivery_time_min: bigint, rank: int
Project [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L, rank#927]
+- Project [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L, rank#927, rank#927]
   +- Window [rank(amount#6L) windowspecdefinition(region#1, amount#6L DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rank#927], [region#1], [amount#6L DESC NULLS LAST]
      +- Project [order_id#0, region#1, ci

#EXERCISE SET 9 — THINKING QUESTIONS

1. Why does GroupBy introduce shuffle?
2. Why does Window not reduce rows?
3. Why does repartition always cause shuffle?
4. Why is coalesce cheaper than repartition?
5. Why does Spark delay execution until an action?
6. When would you avoid window functions?