In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum

# Create (or reuse) the Spark session that every cell below runs against.
spark = (
    SparkSession.builder
    .appName("Retail Sale Analysis")
    .getOrCreate()
)

In [2]:
# Raw retail transactions: one tuple per sale, fields in the order listed in `columns`.
sales_data = [
    ("T001", "North", "Delhi", "Store-01", "Laptop", "2024-01-01", 75000),
    ("T002", "North", "Delhi", "Store-01", "Mobile", "2024-01-02", 32000),
    ("T003", "North", "Chandigarh", "Store-02", "Tablet", "2024-01-03", 26000),
    ("T004", "South", "Bangalore", "Store-03", "Laptop", "2024-01-01", 78000),
    ("T005", "South", "Chennai", "Store-04", "Mobile", "2024-01-02", 30000),
    ("T006", "South", "Bangalore", "Store-03", "Tablet", "2024-01-03", 24000),
    ("T007", "East", "Kolkata", "Store-05", "Laptop", "2024-01-01", 72000),
    ("T008", "East", "Kolkata", "Store-05", "Mobile", "2024-01-02", 28000),
    ("T009", "East", "Patna", "Store-06", "Tablet", "2024-01-03", 23000),
    ("T010", "West", "Mumbai", "Store-07", "Laptop", "2024-01-01", 80000),
    ("T011", "West", "Mumbai", "Store-07", "Mobile", "2024-01-02", 35000),
    ("T012", "West", "Pune", "Store-08", "Tablet", "2024-01-03", 27000),
    ("T013", "North", "Delhi", "Store-01", "Laptop", "2024-01-04", 76000),
    ("T014", "South", "Chennai", "Store-04", "Laptop", "2024-01-04", 79000),
    ("T015", "East", "Patna", "Store-06", "Mobile", "2024-01-04", 29000),
    ("T016", "West", "Pune", "Store-08", "Laptop", "2024-01-04", 77000),
    ("T017", "North", "Chandigarh", "Store-02", "Mobile", "2024-01-05", 31000),
    ("T018", "South", "Bangalore", "Store-03", "Mobile", "2024-01-05", 34000),
    ("T019", "East", "Kolkata", "Store-05", "Tablet", "2024-01-05", 25000),
    ("T020", "West", "Mumbai", "Store-07", "Tablet", "2024-01-05", 29000),
    ("T021", "North", "Delhi", "Store-01", "Tablet", "2024-01-06", 28000),
    ("T022", "South", "Chennai", "Store-04", "Tablet", "2024-01-06", 26000),
    ("T023", "East", "Patna", "Store-06", "Laptop", "2024-01-06", 74000),
    ("T024", "West", "Pune", "Store-08", "Mobile", "2024-01-06", 33000),
]

# Column names, matched positionally to the fields of each tuple above.
columns = ["txn_id", "region", "city", "store_id", "product", "sale_date", "amount"]

# Only names are supplied, so Spark infers the types: per the printed schema,
# sale_date is kept as a string and amount becomes a long.
df_sales = spark.createDataFrame(sales_data, schema=columns)

# Quick sanity check of the contents and the inferred schema.
df_sales.show(5)
df_sales.printSchema()

+------+------+----------+--------+-------+----------+------+
|txn_id|region|      city|store_id|product| sale_date|amount|
+------+------+----------+--------+-------+----------+------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|
+------+------+----------+--------+-------+----------+------+
only showing top 5 rows
root
 |-- txn_id: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- sale_date: string (nullable = true)
 |-- amount: long (nullable = true)



# EXERCISE SET 1 — SELECT OPERATIONS

Exercises
1. Select only txn_id, region, product, and amount
2. Rename amount to revenue
3. Create a derived column amount_in_thousands
4. Select distinct combinations of region and product
5. Select all columns but exclude store_id
6. Create a new column sale_year extracted from sale_date
7. Reorder columns in a business-friendly format

In [3]:
# 1. Keep only the transaction id, region, product and amount columns.
df_sales.select(["txn_id", "region", "product", "amount"]).show(5)


+------+------+-------+------+
|txn_id|region|product|amount|
+------+------+-------+------+
|  T001| North| Laptop| 75000|
|  T002| North| Mobile| 32000|
|  T003| North| Tablet| 26000|
|  T004| South| Laptop| 78000|
|  T005| South| Mobile| 30000|
+------+------+-------+------+
only showing top 5 rows


In [4]:
# 2. Present `amount` under the name `revenue` (only these four columns are projected).
(
    df_sales
    .select("txn_id", "region", "product", "amount")
    .withColumnRenamed("amount", "revenue")
    .show(5)
)


+------+------+-------+-------+
|txn_id|region|product|revenue|
+------+------+-------+-------+
|  T001| North| Laptop|  75000|
|  T002| North| Mobile|  32000|
|  T003| North| Tablet|  26000|
|  T004| South| Laptop|  78000|
|  T005| South| Mobile|  30000|
+------+------+-------+-------+
only showing top 5 rows


In [5]:
# 3. Append a derived column holding the amount expressed in thousands.
df_sales.select(
    "*",
    (col("amount") / 1000).alias("amount_in_thousands"),
).show()

+------+------+----------+--------+-------+----------+------+-------------------+
|txn_id|region|      city|store_id|product| sale_date|amount|amount_in_thousands|
+------+------+----------+--------+-------+----------+------+-------------------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|               75.0|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|               32.0|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|               26.0|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|               78.0|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|               30.0|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|               24.0|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|               72.0|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|               28.0|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|               23.0|
|  T010|  West| 

In [6]:
# 4. List every (region, product) combination exactly once.
(
    df_sales
    .select("region", "product")
    .distinct()
    .show()
)

+------+-------+
|region|product|
+------+-------+
| North| Laptop|
| North| Tablet|
|  East| Tablet|
|  East| Laptop|
| South| Tablet|
| North| Mobile|
|  West| Tablet|
|  East| Mobile|
| South| Mobile|
| South| Laptop|
|  West| Mobile|
|  West| Laptop|
+------+-------+



In [7]:
# 5. Show every column except store_id.
df_sales.drop("store_id").show()

+------+------+----------+-------+----------+------+
|txn_id|region|      city|product| sale_date|amount|
+------+------+----------+-------+----------+------+
|  T001| North|     Delhi| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh| Tablet|2024-01-03| 26000|
|  T004| South| Bangalore| Laptop|2024-01-01| 78000|
|  T005| South|   Chennai| Mobile|2024-01-02| 30000|
|  T006| South| Bangalore| Tablet|2024-01-03| 24000|
|  T007|  East|   Kolkata| Laptop|2024-01-01| 72000|
|  T008|  East|   Kolkata| Mobile|2024-01-02| 28000|
|  T009|  East|     Patna| Tablet|2024-01-03| 23000|
|  T010|  West|    Mumbai| Laptop|2024-01-01| 80000|
|  T011|  West|    Mumbai| Mobile|2024-01-02| 35000|
|  T012|  West|      Pune| Tablet|2024-01-03| 27000|
|  T013| North|     Delhi| Laptop|2024-01-04| 76000|
|  T014| South|   Chennai| Laptop|2024-01-04| 79000|
|  T015|  East|     Patna| Mobile|2024-01-04| 29000|
|  T016|  West|      Pune| Laptop|2024-01-04| 

In [8]:
# 6. Extract the calendar year of each sale into a new sale_year column.
from pyspark.sql.functions import year

# sale_date is a string column (see the printed schema); year() is applied to it directly.
df_sales.withColumn("sale_year", year(col("sale_date"))).show()

+------+------+----------+--------+-------+----------+------+---------+
|txn_id|region|      city|store_id|product| sale_date|amount|sale_year|
+------+------+----------+--------+-------+----------+------+---------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|     2024|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|     2024|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|     2024|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|     2024|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|     2024|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|     2024|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|     2024|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|     2024|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|     2024|
|  T010|  West|    Mumbai|Store-07| Laptop|2024-01-01| 80000|     2024|
|  T011|  West|    Mumbai|Store-07| Mobile|2024-01-02| 35000|   

In [9]:
# 7. Reorder columns: identifier and date first, then location, product, amount and store.
df_sales.select(
    [
        "txn_id",
        "sale_date",
        "region",
        "city",
        "product",
        "amount",
        "store_id",
    ]
).show(5)


+------+----------+------+----------+-------+------+--------+
|txn_id| sale_date|region|      city|product|amount|store_id|
+------+----------+------+----------+-------+------+--------+
|  T001|2024-01-01| North|     Delhi| Laptop| 75000|Store-01|
|  T002|2024-01-02| North|     Delhi| Mobile| 32000|Store-01|
|  T003|2024-01-03| North|Chandigarh| Tablet| 26000|Store-02|
|  T004|2024-01-01| South| Bangalore| Laptop| 78000|Store-03|
|  T005|2024-01-02| South|   Chennai| Mobile| 30000|Store-04|
+------+----------+------+----------+-------+------+--------+
only showing top 5 rows


# EXERCISE SET 2 — FILTER OPERATIONS

Exercises
1. Filter transactions where amount > 50000
2. Filter only Laptop sales
3. Filter sales from North and South regions
4. Filter sales between 25000 and 75000
5. Filter transactions from Delhi stores only
6. Apply multiple filters using both filter and where
7. Change the order of filters and compare explain(True)
8. Identify which filters Spark pushes down

In [10]:
# 1. Transactions whose amount is strictly greater than 50000.
df_sales.where(col("amount") > 50000).show()

+------+------+---------+--------+-------+----------+------+
|txn_id|region|     city|store_id|product| sale_date|amount|
+------+------+---------+--------+-------+----------+------+
|  T001| North|    Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T004| South|Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T007|  East|  Kolkata|Store-05| Laptop|2024-01-01| 72000|
|  T010|  West|   Mumbai|Store-07| Laptop|2024-01-01| 80000|
|  T013| North|    Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T014| South|  Chennai|Store-04| Laptop|2024-01-04| 79000|
|  T016|  West|     Pune|Store-08| Laptop|2024-01-04| 77000|
|  T023|  East|    Patna|Store-06| Laptop|2024-01-06| 74000|
+------+------+---------+--------+-------+----------+------+



In [11]:
# 2. Laptop sales only.
df_sales.filter(df_sales["product"] == "Laptop").show()

+------+------+---------+--------+-------+----------+------+
|txn_id|region|     city|store_id|product| sale_date|amount|
+------+------+---------+--------+-------+----------+------+
|  T001| North|    Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T004| South|Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T007|  East|  Kolkata|Store-05| Laptop|2024-01-01| 72000|
|  T010|  West|   Mumbai|Store-07| Laptop|2024-01-01| 80000|
|  T013| North|    Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T014| South|  Chennai|Store-04| Laptop|2024-01-04| 79000|
|  T016|  West|     Pune|Store-08| Laptop|2024-01-04| 77000|
|  T023|  East|    Patna|Store-06| Laptop|2024-01-06| 74000|
+------+------+---------+--------+-------+----------+------+



In [12]:
# 3. Sales made in the North or South region.
df_sales.filter(col("region").isin("North", "South")).show()


+------+------+----------+--------+-------+----------+------+
|txn_id|region|      city|store_id|product| sale_date|amount|
+------+------+----------+--------+-------+----------+------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T014| South|   Chennai|Store-04| Laptop|2024-01-04| 79000|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|
|  T018| South| Bangalore|Store-03| Mobile|2024-01-05| 34000|
|  T021| North|     Delhi|Store-01| Tablet|2024-01-06| 28000|
|  T022| South|   Chennai|Store-04| Tablet|2024-01-06| 26000|
+------+------+----------+--------+-------+----------+------+



In [13]:
# 4. Sales with an amount from 25000 to 75000, both bounds included.
df_sales.filter(
    (col("amount") >= 25000) & (col("amount") <= 75000)
).show()

+------+------+----------+--------+-------+----------+------+
|txn_id|region|      city|store_id|product| sale_date|amount|
+------+------+----------+--------+-------+----------+------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|
|  T011|  West|    Mumbai|Store-07| Mobile|2024-01-02| 35000|
|  T012|  West|      Pune|Store-08| Tablet|2024-01-03| 27000|
|  T015|  East|     Patna|Store-06| Mobile|2024-01-04| 29000|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|
|  T018| South| Bangalore|Store-03| Mobile|2024-01-05| 34000|
|  T019|  East|   Kolkata|Store-05| Tablet|2024-01-05| 25000|
|  T020|  West|    Mumbai|Store-07| Tablet|2024-01-05| 29000|
|  T021|

In [14]:
# 5. Transactions recorded in Delhi.
df_sales.where(col("city") == "Delhi").show()

+------+------+-----+--------+-------+----------+------+
|txn_id|region| city|store_id|product| sale_date|amount|
+------+------+-----+--------+-------+----------+------+
|  T001| North|Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T013| North|Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T021| North|Delhi|Store-01| Tablet|2024-01-06| 28000|
+------+------+-----+--------+-------+----------+------+



In [16]:
# 6. Combine two conditions, one applied with filter() and one with where().
(
    df_sales
    .filter(col("region") == "North")
    .where(col("product") == "Laptop")
    .show()
)

+------+------+-----+--------+-------+----------+------+
|txn_id|region| city|store_id|product| sale_date|amount|
+------+------+-----+--------+-------+----------+------+
|  T001| North|Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T013| North|Delhi|Store-01| Laptop|2024-01-04| 76000|
+------+------+-----+--------+-------+----------+------+



In [17]:
# 7. Chain two filters and inspect the query plans.
(
    df_sales
    .filter(col("amount") > 30000)
    .filter(col("product") == "Laptop")
    .explain(True)
)
# In the plan printed below, the two chained filters appear as separate Filter nodes in the
# parsed/analyzed plans but as a single Filter with both predicates ANDed in the optimized plan.

== Parsed Logical Plan ==
'Filter '`=`('product, Laptop)
+- Filter (amount#6L > cast(30000 as bigint))
   +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
txn_id: string, region: string, city: string, store_id: string, product: string, sale_date: string, amount: bigint
Filter (product#4 = Laptop)
+- Filter (amount#6L > cast(30000 as bigint))
   +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Optimized Logical Plan ==
Filter ((isnotnull(amount#6L) AND isnotnull(product#4)) AND ((amount#6L > 30000) AND (product#4 = Laptop)))
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Physical Plan ==
*(1) Filter ((isnotnull(amount#6L) AND isnotnull(product#4)) AND ((amount#6L > 30000) AND (product#4 = Laptop)))
+- *(1) Scan ExistingRDD[txn_id#0,region#1,city#2,store_id#3,product#4,sale_date#5,amount#6L]



In [18]:
# 8. Inspect the plan of a single filter to see where the predicate is applied.
# In the physical plan printed below, the Filter node sits above `Scan ExistingRDD`.
(
    df_sales
    .filter(col("amount") > 30000)
    .explain(True)
)

== Parsed Logical Plan ==
'Filter '`>`('amount, 30000)
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
txn_id: string, region: string, city: string, store_id: string, product: string, sale_date: string, amount: bigint
Filter (amount#6L > cast(30000 as bigint))
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Optimized Logical Plan ==
Filter (isnotnull(amount#6L) AND (amount#6L > 30000))
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Physical Plan ==
*(1) Filter (isnotnull(amount#6L) AND (amount#6L > 30000))
+- *(1) Scan ExistingRDD[txn_id#0,region#1,city#2,store_id#3,product#4,sale_date#5,amount#6L]



# EXERCISE SET 3 — GROUPBY & AGGREGATE FUNCTIONS

Exercises
1. Total sales amount per region
2. Average sales amount per product
3. Maximum sale per city
4. Minimum sale per store
5. Count of transactions per region
6. Total revenue per store
7. Region-wise product sales count
8. Average transaction value per city
9. Identify regions with total sales above a threshold
10. Use explain(True) and identify shuffle stages

In [21]:
# NOTE: from this cell on, `sum`, `max` and `min` refer to the Spark column functions,
# not the Python builtins of the same name.
from pyspark.sql.functions import sum, avg, max, min, count

# 1. Total sales amount per region.
(
    df_sales
    .groupBy("region")
    .agg(sum("amount").alias("total_sales"))
    .show()
)

+------+-----------+
|region|total_sales|
+------+-----------+
| South|     271000|
|  East|     251000|
|  West|     281000|
| North|     268000|
+------+-----------+



In [22]:
# 2. Average sale amount per product.
(
    df_sales
    .groupBy("product")
    .agg(avg("amount").alias("avg_sales"))
    .show()
)

+-------+---------+
|product|avg_sales|
+-------+---------+
| Laptop|  76375.0|
| Mobile|  31500.0|
| Tablet|  26000.0|
+-------+---------+



In [23]:
# 3. Largest single sale per city.
(
    df_sales
    .groupBy("city")
    .agg(max("amount").alias("max_sales"))
    .show()
)

+----------+---------+
|      city|max_sales|
+----------+---------+
| Bangalore|    78000|
|     Patna|    74000|
|   Chennai|    79000|
|    Mumbai|    80000|
|   Kolkata|    72000|
|      Pune|    77000|
|     Delhi|    76000|
|Chandigarh|    31000|
+----------+---------+



In [25]:
# 4. Smallest single sale per store.
(
    df_sales
    .groupBy("store_id")
    .agg(min("amount").alias("min_sales"))
    .show()
)

+--------+---------+
|store_id|min_sales|
+--------+---------+
|Store-05|    25000|
|Store-06|    23000|
|Store-03|    24000|
|Store-01|    28000|
|Store-04|    26000|
|Store-07|    29000|
|Store-08|    27000|
|Store-02|    26000|
+--------+---------+



In [26]:
# 5. Number of transactions per region.
(
    df_sales
    .groupBy("region")
    .agg(count("*").alias("transaction_count"))
    .show()
)

+------+-----------------+
|region|transaction_count|
+------+-----------------+
| South|                6|
|  East|                6|
|  West|                6|
| North|                6|
+------+-----------------+



In [27]:
# 6. Total revenue per store.
(
    df_sales
    .groupBy("store_id")
    .agg(sum("amount").alias("total_revenue"))
    .show()
)

+--------+-------------+
|store_id|total_revenue|
+--------+-------------+
|Store-05|       125000|
|Store-06|       126000|
|Store-03|       136000|
|Store-01|       211000|
|Store-04|       135000|
|Store-07|       144000|
|Store-08|       137000|
|Store-02|        57000|
+--------+-------------+



In [28]:
# 7. Number of sales for each (region, product) pair.
(
    df_sales
    .groupBy("region", "product")
    .agg(count("*").alias("sales_count"))
    .show()
)

+------+-------+-----------+
|region|product|sales_count|
+------+-------+-----------+
| North| Laptop|          2|
| North| Tablet|          2|
|  East| Tablet|          2|
|  East| Laptop|          2|
| South| Tablet|          2|
| North| Mobile|          2|
|  West| Tablet|          2|
|  East| Mobile|          2|
| South| Mobile|          2|
| South| Laptop|          2|
|  West| Mobile|          2|
|  West| Laptop|          2|
+------+-------+-----------+



In [29]:
# 8. Average transaction value per city.
(
    df_sales
    .groupBy("city")
    .agg(avg("amount").alias("avg_transaction_value"))
    .show()
)

+----------+---------------------+
|      city|avg_transaction_value|
+----------+---------------------+
| Bangalore|   45333.333333333336|
|     Patna|              42000.0|
|   Chennai|              45000.0|
|    Mumbai|              48000.0|
|   Kolkata|   41666.666666666664|
|      Pune|   45666.666666666664|
|     Delhi|              52750.0|
|Chandigarh|              28500.0|
+----------+---------------------+



In [30]:
# 9. Regions whose total sales exceed 100000 (the filter runs on the aggregated column).
(
    df_sales
    .groupBy("region")
    .agg(sum("amount").alias("total_sales"))
    .where(col("total_sales") > 100000)
    .show()
)

+------+-----------+
|region|total_sales|
+------+-----------+
| South|     271000|
|  East|     251000|
|  West|     281000|
| North|     268000|
+------+-----------+



In [31]:
# 10. Print the plans for a groupBy aggregation; the Exchange node in the
# physical plan printed below is where the shuffle happens.
(
    df_sales
    .groupBy("region")
    .agg(sum("amount").alias("total_sales"))
    .explain(True)
)

== Parsed Logical Plan ==
'Aggregate ['region], ['region, 'sum('amount) AS total_sales#466]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
region: string, total_sales: bigint
Aggregate [region#1], [region#1, sum(amount#6L) AS total_sales#466L]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Optimized Logical Plan ==
Aggregate [region#1], [region#1, sum(amount#6L) AS total_sales#466L]
+- Project [region#1, amount#6L]
   +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[region#1], functions=[sum(amount#6L)], output=[region#1, total_sales#466L])
   +- Exchange hashpartitioning(region#1, 200), ENSURE_REQUIREMENTS, [plan_id=685]
      +- HashAggregate(keys=[region#1], functions=[partial_sum(amount#6L)], output=[region#1, sum#476L])
    

# EXERCISE SET 4 — MULTI-DIMENSIONAL AGGREGATION

1. Region + Product wise total sales
2. City + Store wise average sales
3. Region + City wise transaction count
4. Product + Store wise max sale
5. Identify top-selling product per region using aggregation only

In [32]:
# 1. Total sales for each (region, product) pair.
(
    df_sales
    .groupBy("region", "product")
    .agg(sum("amount").alias("total_sales"))
    .show()
)

+------+-------+-----------+
|region|product|total_sales|
+------+-------+-----------+
| North| Laptop|     151000|
| North| Tablet|      54000|
|  East| Tablet|      48000|
|  East| Laptop|     146000|
| South| Tablet|      50000|
| North| Mobile|      63000|
|  West| Tablet|      56000|
|  East| Mobile|      57000|
| South| Mobile|      64000|
| South| Laptop|     157000|
|  West| Mobile|      68000|
|  West| Laptop|     157000|
+------+-------+-----------+



In [33]:
# 2. Average sale amount for each (city, store) pair.
(
    df_sales
    .groupBy("city", "store_id")
    .agg(avg("amount").alias("avg_sales"))
    .show()
)

+----------+--------+------------------+
|      city|store_id|         avg_sales|
+----------+--------+------------------+
| Bangalore|Store-03|45333.333333333336|
|     Patna|Store-06|           42000.0|
|   Chennai|Store-04|           45000.0|
|      Pune|Store-08|45666.666666666664|
|Chandigarh|Store-02|           28500.0|
|   Kolkata|Store-05|41666.666666666664|
|    Mumbai|Store-07|           48000.0|
|     Delhi|Store-01|           52750.0|
+----------+--------+------------------+



In [34]:
# 3. Number of transactions for each (region, city) pair.
(
    df_sales
    .groupBy("region", "city")
    .agg(count("*").alias("transaction_count"))
    .show()
)

+------+----------+-----------------+
|region|      city|transaction_count|
+------+----------+-----------------+
|  West|    Mumbai|                3|
| South| Bangalore|                3|
| North|     Delhi|                4|
| North|Chandigarh|                2|
| South|   Chennai|                3|
|  West|      Pune|                3|
|  East|   Kolkata|                3|
|  East|     Patna|                3|
+------+----------+-----------------+



In [35]:
# 4. Largest single sale for each (product, store) pair.
(
    df_sales
    .groupBy("product", "store_id")
    .agg(max("amount").alias("max_sale"))
    .show()
)

+-------+--------+--------+
|product|store_id|max_sale|
+-------+--------+--------+
| Tablet|Store-06|   23000|
| Laptop|Store-07|   80000|
| Laptop|Store-01|   76000|
| Tablet|Store-02|   26000|
| Mobile|Store-01|   32000|
| Laptop|Store-03|   78000|
| Tablet|Store-08|   27000|
| Tablet|Store-03|   24000|
| Mobile|Store-04|   30000|
| Mobile|Store-07|   35000|
| Mobile|Store-05|   28000|
| Laptop|Store-05|   72000|
| Tablet|Store-01|   28000|
| Tablet|Store-07|   29000|
| Laptop|Store-08|   77000|
| Mobile|Store-08|   33000|
| Laptop|Store-04|   79000|
| Tablet|Store-05|   25000|
| Tablet|Store-04|   26000|
| Laptop|Store-06|   74000|
+-------+--------+--------+
only showing top 20 rows


In [42]:
#5
# Top-selling product per region, using aggregation and a join only (no window functions).
from pyspark.sql.functions import col, sum, max

# Step 1: total sales for every (region, product) pair.
region_product_sales = df_sales.groupBy("region","product").agg(sum("amount").alias("total_sales"))

# Step 2: the highest per-product total inside each region.
max_sales = region_product_sales.groupBy("region").agg(max("total_sales").alias("max_sales"))

# Both sides of the join derive from the same DataFrame and both carry a `region`
# column, so each side gets an alias and every column is referenced *through* that
# alias by name. Attribute access such as `aliased_df.region` does not use the alias
# and leaves the self-join condition ambiguous.
aliased_region_product_sales = region_product_sales.alias("a")
aliased_max_sales = max_sales.alias("b")

# Step 3: keep the (region, product) rows whose total equals the region's maximum.
# Products tied for the maximum within a region are all returned.
aliased_region_product_sales.join(
    aliased_max_sales,
    (col("a.region") == col("b.region"))
    & (col("a.total_sales") == col("b.max_sales")),
    "inner"
).select("a.region", "a.product", "a.total_sales").show()

+------+-------+-----------+
|region|product|total_sales|
+------+-------+-----------+
| North| Laptop|     151000|
|  East| Laptop|     146000|
| South| Laptop|     157000|
|  West| Laptop|     157000|
+------+-------+-----------+



# EXERCISE SET 5 — WINDOW FUNCTIONS (OVER)

Exercises
1. Compute running total of sales per region ordered by date
2. Rank transactions by amount within each region
3. Assign row numbers per store ordered by sale amount
4. Use dense rank to rank products per region
5. Identify top 2 highest sales per region using window functions
6. Compare rank vs dense_rank output
7. Calculate cumulative sales per store
8. Identify first and last transaction per city using windows

In [43]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, rank, row_number, dense_rank

In [44]:
# 1. Running total of sales per region, accumulated in sale_date order.
# The frame spans from the first row of the partition up to the current row.
w = (
    Window.partitionBy("region")
    .orderBy("sale_date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df_sales.withColumn("running_total", sum("amount").over(w)).show()

+------+------+----------+--------+-------+----------+------+-------------+
|txn_id|region|      city|store_id|product| sale_date|amount|running_total|
+------+------+----------+--------+-------+----------+------+-------------+
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|        72000|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|       100000|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|       123000|
|  T015|  East|     Patna|Store-06| Mobile|2024-01-04| 29000|       152000|
|  T019|  East|   Kolkata|Store-05| Tablet|2024-01-05| 25000|       177000|
|  T023|  East|     Patna|Store-06| Laptop|2024-01-06| 74000|       251000|
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|        75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|       107000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|       133000|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|       209000|
|  T017| Nor

In [45]:
# 2. Rank transactions within each region, highest amount first.
w = (
    Window.partitionBy("region")
    .orderBy(col("amount").desc())
)
df_sales.withColumn("rank", rank().over(w)).show()

+------+------+----------+--------+-------+----------+------+----+
|txn_id|region|      city|store_id|product| sale_date|amount|rank|
+------+------+----------+--------+-------+----------+------+----+
|  T023|  East|     Patna|Store-06| Laptop|2024-01-06| 74000|   1|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|   2|
|  T015|  East|     Patna|Store-06| Mobile|2024-01-04| 29000|   3|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|   4|
|  T019|  East|   Kolkata|Store-05| Tablet|2024-01-05| 25000|   5|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|   6|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|   1|
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|   2|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|   3|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|   4|
|  T021| North|     Delhi|Store-01| Tablet|2024-01-06| 28000|   5|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000| 

In [46]:
# 3. Number the transactions of each store, highest amount first.
w = (
    Window.partitionBy("store_id")
    .orderBy(col("amount").desc())
)
df_sales.withColumn("row_number", row_number().over(w)).show()

+------+------+----------+--------+-------+----------+------+----------+
|txn_id|region|      city|store_id|product| sale_date|amount|row_number|
+------+------+----------+--------+-------+----------+------+----------+
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|         1|
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|         2|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|         3|
|  T021| North|     Delhi|Store-01| Tablet|2024-01-06| 28000|         4|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|         1|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|         2|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|         1|
|  T018| South| Bangalore|Store-03| Mobile|2024-01-05| 34000|         2|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|         3|
|  T014| South|   Chennai|Store-04| Laptop|2024-01-04| 79000|         1|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-

In [47]:
# 4. Dense rank within each region, highest amount first.
w = (
    Window.partitionBy("region")
    .orderBy(col("amount").desc())
)
df_sales.withColumn("dense_rank", dense_rank().over(w)).show()

+------+------+----------+--------+-------+----------+------+----------+
|txn_id|region|      city|store_id|product| sale_date|amount|dense_rank|
+------+------+----------+--------+-------+----------+------+----------+
|  T023|  East|     Patna|Store-06| Laptop|2024-01-06| 74000|         1|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|         2|
|  T015|  East|     Patna|Store-06| Mobile|2024-01-04| 29000|         3|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|         4|
|  T019|  East|   Kolkata|Store-05| Tablet|2024-01-05| 25000|         5|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|         6|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|         1|
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|         2|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|         3|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|         4|
|  T021| North|     Delhi|Store-01| Tablet|2024-01-

In [48]:
#5
# Top 2 highest sales per region.
# The window is defined in this cell rather than read from the shared `w` variable:
# `w` is rebound to different windows by other cells, so relying on it makes the
# result depend on which cell happened to run last.
region_by_amount_desc = Window.partitionBy("region").orderBy(col("amount").desc())

# rank() gives tied amounts the same rank, so a tie inside the top 2 returns every tied row.
(
    df_sales
    .withColumn("rank", rank().over(region_by_amount_desc))
    .filter(col("rank") <= 2)
    .show()
)

+------+------+---------+--------+-------+----------+------+----+
|txn_id|region|     city|store_id|product| sale_date|amount|rank|
+------+------+---------+--------+-------+----------+------+----+
|  T023|  East|    Patna|Store-06| Laptop|2024-01-06| 74000|   1|
|  T007|  East|  Kolkata|Store-05| Laptop|2024-01-01| 72000|   2|
|  T013| North|    Delhi|Store-01| Laptop|2024-01-04| 76000|   1|
|  T001| North|    Delhi|Store-01| Laptop|2024-01-01| 75000|   2|
|  T014| South|  Chennai|Store-04| Laptop|2024-01-04| 79000|   1|
|  T004| South|Bangalore|Store-03| Laptop|2024-01-01| 78000|   2|
|  T010|  West|   Mumbai|Store-07| Laptop|2024-01-01| 80000|   1|
|  T016|  West|     Pune|Store-08| Laptop|2024-01-04| 77000|   2|
+------+------+---------+--------+-------+----------+------+----+



In [49]:
#6
# Compare rank() and dense_rank() side by side within each region, highest amount first.
# The window is defined in this cell rather than read from the shared `w` variable,
# which other cells rebind to different windows.
rank_window = Window.partitionBy("region").orderBy(col("amount").desc())

# The two functions only differ when amounts tie within a region: rank() leaves a gap
# after a tie, dense_rank() does not. In the output shown below the two columns are
# identical on every row.
df_sales.select(
    "region",
    "amount",
    rank().over(rank_window).alias("rank"),
    dense_rank().over(rank_window).alias("dense_rank"),
).show()

+------+------+----+----------+
|region|amount|rank|dense_rank|
+------+------+----+----------+
|  East| 74000|   1|         1|
|  East| 72000|   2|         2|
|  East| 29000|   3|         3|
|  East| 28000|   4|         4|
|  East| 25000|   5|         5|
|  East| 23000|   6|         6|
| North| 76000|   1|         1|
| North| 75000|   2|         2|
| North| 32000|   3|         3|
| North| 31000|   4|         4|
| North| 28000|   5|         5|
| North| 26000|   6|         6|
| South| 79000|   1|         1|
| South| 78000|   2|         2|
| South| 34000|   3|         3|
| South| 30000|   4|         4|
| South| 26000|   5|         5|
| South| 24000|   6|         6|
|  West| 80000|   1|         1|
|  West| 77000|   2|         2|
+------+------+----+----------+
only showing top 20 rows


In [50]:
# 7. Cumulative sales per store, accumulated in sale_date order.
# The frame spans from the first row of the partition up to the current row.
w = (
    Window.partitionBy("store_id")
    .orderBy("sale_date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df_sales.withColumn("cumulative_sales", sum("amount").over(w)).show()

+------+------+----------+--------+-------+----------+------+----------------+
|txn_id|region|      city|store_id|product| sale_date|amount|cumulative_sales|
+------+------+----------+--------+-------+----------+------+----------------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|           75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|          107000|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|          183000|
|  T021| North|     Delhi|Store-01| Tablet|2024-01-06| 28000|          211000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|           26000|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|           57000|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|           78000|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|          102000|
|  T018| South| Bangalore|Store-03| Mobile|2024-01-05| 34000|          136000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-0

In [51]:
# Exercise 8: number each city's transactions by sale_date in both directions.
# first_txn == 1 marks the earliest row of a city, last_txn == 1 the latest.
w_asc = Window.partitionBy("city").orderBy(col("sale_date").asc())
w_desc = Window.partitionBy("city").orderBy(col("sale_date").desc())

(
    df_sales
    .withColumn("first_txn", row_number().over(w_asc))
    .withColumn("last_txn", row_number().over(w_desc))
    .show()
)

+------+------+----------+--------+-------+----------+------+---------+--------+
|txn_id|region|      city|store_id|product| sale_date|amount|first_txn|last_txn|
+------+------+----------+--------+-------+----------+------+---------+--------+
|  T018| South| Bangalore|Store-03| Mobile|2024-01-05| 34000|        3|       1|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|        2|       2|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|        1|       3|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|        2|       1|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|        1|       2|
|  T022| South|   Chennai|Store-04| Tablet|2024-01-06| 26000|        3|       1|
|  T014| South|   Chennai|Store-04| Laptop|2024-01-04| 79000|        2|       2|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|        1|       3|
|  T021| North|     Delhi|Store-01| Tablet|2024-01-06| 28000|        4|       1|
|  T013| North|     Delhi|St

# EXERCISE SET 7 — DAG & PERFORMANCE OBSERVATION

In [None]:
# Exercise prompt, kept as comments so this code cell is valid Python and
# does not stop a Restart & Run All with a SyntaxError.
#
# Exercises
# 1. Run explain(True) for:
#    - Simple select
#    - Filter
#    - GroupBy
#    - Window function
# 2. Identify:
#    - Shuffles
#    - Exchanges
#    - Sorts
# 3. Explain why window functions introduce sorting

In [52]:
# Exercise 1 (simple select): print the extended plan -- parsed, analyzed and
# optimized logical plans plus the physical plan -- for a two-column projection.
df_sales.select("region", "amount").explain(extended=True)

== Parsed Logical Plan ==
'Project ['region, 'amount]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
region: string, amount: bigint
Project [region#1, amount#6L]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Optimized Logical Plan ==
Project [region#1, amount#6L]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Physical Plan ==
*(1) Project [region#1, amount#6L]
+- *(1) Scan ExistingRDD[txn_id#0,region#1,city#2,store_id#3,product#4,sale_date#5,amount#6L]



In [53]:
# Exercise 1 (filter): extended plan for a row-level predicate on amount.
(
    df_sales
    .filter(col("amount") > 30000)
    .explain(extended=True)
)

== Parsed Logical Plan ==
'Filter '`>`('amount, 30000)
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
txn_id: string, region: string, city: string, store_id: string, product: string, sale_date: string, amount: bigint
Filter (amount#6L > cast(30000 as bigint))
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Optimized Logical Plan ==
Filter (isnotnull(amount#6L) AND (amount#6L > 30000))
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Physical Plan ==
*(1) Filter (isnotnull(amount#6L) AND (amount#6L > 30000))
+- *(1) Scan ExistingRDD[txn_id#0,region#1,city#2,store_id#3,product#4,sale_date#5,amount#6L]



In [54]:
# Exercise 1 (groupBy): extended plan for the per-region total of amount.
# The recorded physical plan shows an Exchange (hash partitioning on region)
# between the partial and the final HashAggregate.
(
    df_sales
    .groupBy("region")
    .sum("amount")
    .explain(extended=True)
)

== Parsed Logical Plan ==
'Aggregate ['region], ['region, unresolvedalias('sum(amount#6L))]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
region: string, sum(amount): bigint
Aggregate [region#1], [region#1, sum(amount#6L) AS sum(amount)#979L]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Optimized Logical Plan ==
Aggregate [region#1], [region#1, sum(amount#6L) AS sum(amount)#979L]
+- Project [region#1, amount#6L]
   +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[region#1], functions=[sum(amount#6L)], output=[region#1, sum(amount)#979L])
   +- Exchange hashpartitioning(region#1, 200), ENSURE_REQUIREMENTS, [plan_id=1757]
      +- HashAggregate(keys=[region#1], functions=[partial_sum(amount#6L)], output=[region#1, sum#981L])
   

In [55]:
# Exercise 1 (window function): extended plan for rank() over a per-store window.
# The window spec is declared in this cell instead of being read from the
# notebook-global `w`, so the result no longer depends on which earlier cell
# last assigned `w`. The spec matches the one recorded in the plan output:
# partition by store_id, order by sale_date, rows from unbounded preceding
# to the current row.
store_by_date_window = (
    Window.partitionBy("store_id")
    .orderBy("sale_date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
(
    df_sales
    .withColumn("rank", rank().over(store_by_date_window))
    .explain(extended=True)
)

== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(rank, 'rank() windowspecdefinition('store_id, 'sale_date ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())), None)]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
txn_id: string, region: string, city: string, store_id: string, product: string, sale_date: string, amount: bigint, rank: int
Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, rank#982]
+- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, rank#982, rank#982]
   +- Window [rank(sale_date#5) windowspecdefinition(store_id#3, sale_date#5 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rank#982], [store_id#3], [sale_date#5 ASC NULLS FIRST]
      +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L]
         +- Logica