In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window


In [3]:
spark = SparkSession.builder.appName("Management Upstream").getOrCreate()

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
csv_path = "/content/drive/MyDrive/Colab Notebooks/orders_large_bad.csv"
json_path = "/content/drive/MyDrive/Colab Notebooks/orders_large_bad.json"

PHASE 1 — INGESTION & FIRST
INSPECTION

Exercises

1. Read the CSV file into a DataFrame
2. Disable schema inference and read everything as string
3. Print schema and record count
4. Display 20 random rows
5. Identify at least 5 data quality issues by observation
6. Read the JSON file and compare schema and row count with CSV

In [7]:
#1,2,3
df_csv_raw = (spark.read.option("header", "True")
              .option("inferSchema", "False").csv(csv_path))
df_csv_raw.printSchema()
df_csv_raw.count()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



300000

In [8]:
#4
import pyspark.sql.functions as F
df_csv_raw.orderBy(F.rand()).show(20,truncate = False)

+-----------+-----------+---------+-------------+-------+-------+----------+---------+
|order_id   |customer_id|city     |category     |product|amount |order_date|status   |
+-----------+-----------+---------+-------------+-------+-------+----------+---------+
|ORD00247401|C047401    | chennai |Home         |Vacuum |30435  |22/01/2024|Completed|
|ORD00149501|C049501    |Bangalore|Grocery      |Sugar  |38299  |11/02/2024|Completed|
|ORD00085352|C035352    |Pune     |Fashion      |Jeans  |1656   |2024-02-02|Completed|
|ORD00281553|C031553    |Pune     |Home         |Vacuum |37110  |2024-02-03|Completed|
|ORD00019411|C019411    |Hyderabad|Electronics  |Laptop |64240  |2024-02-01|Completed|
|ORD00192860|C042860    |Delhi    |Fashion      |Jeans  |36134  |2024-01-21|Cancelled|
|ORD00275215|C025215    |Bangalore|Fashion      |Shoes  |invalid|2024-02-25|Completed|
|ORD00263481|C013481    |Pune     |Electronics  |Mobile |33568  |2024-01-22|Completed|
|ORD00275938|C025938    |Pune     |Fashion 

In [None]:
#5
# 1. amount contains "invalid" strings
# 2. amount contains commas ("12,000")
# 3. Mixed order_date formats
# 4. status = Cancelled rows present
# 5. Data in wrong columns
# 6. Inconsistent casing in city, category, product

In [9]:
#6
df_json_raw = (spark.read.option("header", "True")
              .option("inferSchema", "False").json(json_path))
df_json_raw.printSchema()
df_json_raw.count()

root
 |-- amount: string (nullable = true)
 |-- category: string (nullable = true)
 |-- city: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- status: string (nullable = true)



300000

PHASE 2 — SCHEMA ENFORCEMENT &
VALIDATION

Exercises

7. Define an explicit schema using StructType
8. Re-read the CSV using the defined schema
9. Identify rows that fail schema expectations
10. Explain why schema inference is dangerous at scale

In [10]:
#7
csv_orders_schema = StructType([
    StructField("order_id", StringType()),
    StructField("customer_id", StringType()),
    StructField("city", StringType()),
    StructField("category", StringType()),
    StructField("product", StringType()),
    StructField("amount", StringType()),
    StructField("order_date", StringType()),
    StructField("status", StringType())
])

In [12]:
#8
csv_orders_df = spark.read \
.option("header", "true") \
.schema(csv_orders_schema) \
.csv(csv_path)
csv_orders_df.printSchema()
csv_orders_df.show(5)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)

+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|  Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C000004|       Pune|    

In [13]:
#9
invalid_schema_df = csv_orders_df.filter(csv_orders_df.order_id.isNull() | csv_orders_df.amount.isNull())
invalid_schema_df.show()

+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|   order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|ORD00000029|    C000029|Bangalore|    Grocery|      Sugar|  NULL|2024-01-30|Completed|
|ORD00000058|    C000058|   Mumbai|    Grocery|        Oil|  NULL|2024-02-28|Completed|
|ORD00000087|    C000087|    Delhi|Electronics|     Tablet|  NULL|2024-01-28|Completed|
|ORD00000116|    C000116|Bangalore|    Grocery|      Sugar|  NULL|2024-02-26|Completed|
|ORD00000145|    C000145|    Delhi|       Home|      Mixer|  NULL|2024-01-26|Completed|
|ORD00000174|    C000174|Bangalore|    Grocery|        Oil|  NULL|2024-02-24|Completed|
|ORD00000203|    C000203|Hyderabad|       Home|      Mixer|  NULL|2024-01-24|Completed|
|ORD00000232|    C000232|     Pune|    Fashion|     TShirt|  NULL|2024-02-22|Completed|
|ORD00000261|    C000261|  Kolka

In [None]:
#10
#Slow: Needs full/large scan.
#Wrong types: Mixed data → errors.
#Latency: Adds ingestion delay.
#Corruption: Wrong schema → lost data.
#Sampling fails: Bad sample → bad schema.
#Unstable: Different runs, different results.
#JSON pain: Nested = unpredictable.

PHASE 3 — STRING CLEANING &
STANDARDIZATION

Exercises

11. Trim leading and trailing spaces from all string columns
12. Standardize city , category , and product values
13. Convert all categorical columns to a consistent case
14. Identify how many distinct city values existed before vs after cleaning

In [15]:
#11
str_cols = csv_orders_df.columns

for col in str_cols:
  csv_orders_df = csv_orders_df.withColumn(col, F.trim(F.col(col)))
csv_orders_df.show(10, truncate=False)

+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|order_id   |customer_id|city     |category   |product    |amount |order_date|status   |
+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|ORD00000000|C000000    |hyderabad|grocery    |Oil        |invalid|01/01/2024|Cancelled|
|ORD00000001|C000001    |Pune     |Grocery    |Sugar      |35430  |2024-01-02|Completed|
|ORD00000002|C000002    |Pune     |Electronics|Mobile     |65358  |2024-01-03|Completed|
|ORD00000003|C000003    |Bangalore|Electronics|Laptop     |5558   |2024-01-04|Completed|
|ORD00000004|C000004    |Pune     |Home       |AirPurifier|33659  |2024-01-05|Completed|
|ORD00000005|C000005    |Delhi    |Fashion    |Jeans      |8521   |2024-01-06|Completed|
|ORD00000006|C000006    |Delhi    |Grocery    |Sugar      |42383  |2024-01-07|Completed|
|ORD00000007|C000007    |Pune     |Grocery    |Rice       |45362  |2024-01-08|Completed|
|ORD00000008|C000008 

In [16]:
#12
csv_orders_df = csv_orders_df.withColumn("city", F.upper("city")).withColumn("category", F.upper("category")) \
.withColumn("product", F.upper("product"))

csv_orders_df.select("city", "category", "product").show(10, truncate=False)

+---------+-----------+-----------+
|city     |category   |product    |
+---------+-----------+-----------+
|HYDERABAD|GROCERY    |OIL        |
|PUNE     |GROCERY    |SUGAR      |
|PUNE     |ELECTRONICS|MOBILE     |
|BANGALORE|ELECTRONICS|LAPTOP     |
|PUNE     |HOME       |AIRPURIFIER|
|DELHI    |FASHION    |JEANS      |
|DELHI    |GROCERY    |SUGAR      |
|PUNE     |GROCERY    |RICE       |
|BANGALORE|FASHION    |JEANS      |
|KOLKATA  |ELECTRONICS|LAPTOP     |
+---------+-----------+-----------+
only showing top 10 rows


In [18]:
#13
csv_orders_df = csv_orders_df.withColumn("status", F.upper("status"))
csv_orders_df.select("status").distinct().show(truncate=False)

+---------+
|status   |
+---------+
|CANCELLED|
|COMPLETED|
+---------+



In [19]:
#14
csv_orders_df.select("city").distinct().show(20, truncate=False)

+---------+
|city     |
+---------+
|KOLKATA  |
|BANGALORE|
|DELHI    |
|HYDERABAD|
|CHENNAI  |
|PUNE     |
|MUMBAI   |
+---------+



PHASE 4 — AMOUNT CLEANING (CRITICAL)

Exercises

15. Identify invalid values in the amount column
16. Remove commas from numeric strings
17. Convert amount to IntegerType safely
18. Handle empty, null, and invalid values explicitly
19. Count how many records were affected during amount cleaning

In [20]:
#15
csv_orders_df.filter(~F.col("amount").rlike("^[0-9,]+$")).show(20, truncate=False)

+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|order_id   |customer_id|city     |category   |product    |amount |order_date|status   |
+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|ORD00000000|C000000    |HYDERABAD|GROCERY    |OIL        |invalid|01/01/2024|CANCELLED|
|ORD00000019|C000019    |MUMBAI   |ELECTRONICS|MOBILE     |invalid|2024-01-20|COMPLETED|
|ORD00000038|C000038    |DELHI    |HOME       |VACUUM     |invalid|2024-02-08|COMPLETED|
|ORD00000057|C000057    |KOLKATA  |HOME       |AIRPURIFIER|invalid|2024-02-27|COMPLETED|
|ORD00000076|C000076    |MUMBAI   |HOME       |AIRPURIFIER|invalid|2024-01-17|COMPLETED|
|ORD00000095|C000095    |HYDERABAD|ELECTRONICS|MOBILE     |invalid|2024-02-05|COMPLETED|
|ORD00000114|C000114    |BANGALORE|HOME       |AIRPURIFIER|invalid|2024-02-24|COMPLETED|
|ORD00000133|C000133    |KOLKATA  |HOME       |VACUUM     |invalid|2024-01-14|COMPLETED|
|ORD00000152|C000152 

In [21]:
#16
csv_orders_df = csv_orders_df.withColumn("amount_clean", F.regexp_replace("amount", ",", ""))
csv_orders_df.select("amount", "amount_clean").show(20, truncate=False)

+-------+------------+
|amount |amount_clean|
+-------+------------+
|invalid|invalid     |
|35430  |35430       |
|65358  |65358       |
|5558   |5558        |
|33659  |33659       |
|8521   |8521        |
|42383  |42383       |
|45362  |45362       |
|10563  |10563       |
|63715  |63715       |
|66576  |66576       |
|50318  |50318       |
|84768  |84768       |
|79121  |79121       |
|79469  |79469       |
|81018  |81018       |
|64225  |64225       |
|69582  |69582       |
|50424  |50424       |
|invalid|invalid     |
+-------+------------+
only showing top 20 rows


In [22]:
#17
csv_orders_df = csv_orders_df.withColumn("amount_int",F.when(F.col("amount_clean")\
                                                             .rlike("^[0-9]+$"),F.col("amount_clean").cast("int")))

csv_orders_df.select("amount_clean", "amount_int").show(20, truncate=False)

+------------+----------+
|amount_clean|amount_int|
+------------+----------+
|invalid     |NULL      |
|35430       |35430     |
|65358       |65358     |
|5558        |5558      |
|33659       |33659     |
|8521        |8521      |
|42383       |42383     |
|45362       |45362     |
|10563       |10563     |
|63715       |63715     |
|66576       |66576     |
|50318       |50318     |
|84768       |84768     |
|79121       |79121     |
|79469       |79469     |
|81018       |81018     |
|64225       |64225     |
|69582       |69582     |
|50424       |50424     |
|invalid     |NULL      |
+------------+----------+
only showing top 20 rows


In [23]:
#18
csv_orders_df.filter(F.col("amount_int").isNull()).show(20, truncate=False)

+-----------+-----------+---------+-----------+-----------+-------+----------+---------+------------+----------+
|order_id   |customer_id|city     |category   |product    |amount |order_date|status   |amount_clean|amount_int|
+-----------+-----------+---------+-----------+-----------+-------+----------+---------+------------+----------+
|ORD00000000|C000000    |HYDERABAD|GROCERY    |OIL        |invalid|01/01/2024|CANCELLED|invalid     |NULL      |
|ORD00000019|C000019    |MUMBAI   |ELECTRONICS|MOBILE     |invalid|2024-01-20|COMPLETED|invalid     |NULL      |
|ORD00000029|C000029    |BANGALORE|GROCERY    |SUGAR      |NULL   |2024-01-30|COMPLETED|NULL        |NULL      |
|ORD00000038|C000038    |DELHI    |HOME       |VACUUM     |invalid|2024-02-08|COMPLETED|invalid     |NULL      |
|ORD00000057|C000057    |KOLKATA  |HOME       |AIRPURIFIER|invalid|2024-02-27|COMPLETED|invalid     |NULL      |
|ORD00000058|C000058    |MUMBAI   |GROCERY    |OIL        |NULL   |2024-02-28|COMPLETED|NULL    

In [24]:
#19
csv_orders_df.filter(F.col("amount_int").isNull()).count()

25164

PHASE 5 — DATE PARSING &
NORMALIZATION

Exercises

20. Identify all date formats present in order_date
21. Parse valid dates into DateType
22. Handle invalid dates gracefully
23. Create a clean order_date_clean column
24. Count records with invalid dates

In [25]:
#20
csv_orders_df.select("order_date").distinct().show(20, False)

+----------+
|order_date|
+----------+
|2024-01-19|
|2024/01/02|
|30/01/2024|
|2024-02-08|
|2024-02-28|
|2024-01-13|
|2024/01/14|
|18/01/2024|
|27/01/2024|
|06/01/2024|
|2024-02-20|
|2024-01-06|
|2024/01/09|
|16/02/2024|
|2024-02-04|
|15/01/2024|
|2024-02-15|
|2024/02/23|
|2024-02-12|
|2024/02/25|
+----------+
only showing top 20 rows


In [26]:
#21
csv_orders_df = csv_orders_df.withColumn("order_date_clean",F.coalesce(
        F.expr("try_to_timestamp(order_date, 'yyyy-MM-dd')").cast("date"),
        F.expr("try_to_timestamp(order_date, 'yyyy/MM/dd')").cast("date"),
        F.expr("try_to_timestamp(order_date, 'dd/MM/yyyy')").cast("date"),
        F.expr("try_to_timestamp(order_date, 'dd-MM-yyyy')").cast("date")
    )
)
csv_orders_df.select("order_date", "order_date_clean").show(20, truncate=False)

+----------+----------------+
|order_date|order_date_clean|
+----------+----------------+
|01/01/2024|2024-01-01      |
|2024-01-02|2024-01-02      |
|2024-01-03|2024-01-03      |
|2024-01-04|2024-01-04      |
|2024-01-05|2024-01-05      |
|2024-01-06|2024-01-06      |
|2024-01-07|2024-01-07      |
|2024-01-08|2024-01-08      |
|2024-01-09|2024-01-09      |
|2024-01-10|2024-01-10      |
|2024-01-11|2024-01-11      |
|12/01/2024|2024-01-12      |
|2024-01-13|2024-01-13      |
|2024/01/14|2024-01-14      |
|2024-01-15|2024-01-15      |
|2024-01-16|2024-01-16      |
|2024-01-17|2024-01-17      |
|2024-01-18|2024-01-18      |
|2024-01-19|2024-01-19      |
|2024-01-20|2024-01-20      |
+----------+----------------+
only showing top 20 rows


In [27]:
#22
csv_orders_df.filter(F.col("order_date_clean").isNull()).select("order_date").show(20, truncate=False)

+------------+
|order_date  |
+------------+
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
+------------+
only showing top 20 rows


In [28]:
#23
csv_orders_df.select("order_id", "order_date", "order_date_clean").show(20, truncate=False)

+-----------+----------+----------------+
|order_id   |order_date|order_date_clean|
+-----------+----------+----------------+
|ORD00000000|01/01/2024|2024-01-01      |
|ORD00000001|2024-01-02|2024-01-02      |
|ORD00000002|2024-01-03|2024-01-03      |
|ORD00000003|2024-01-04|2024-01-04      |
|ORD00000004|2024-01-05|2024-01-05      |
|ORD00000005|2024-01-06|2024-01-06      |
|ORD00000006|2024-01-07|2024-01-07      |
|ORD00000007|2024-01-08|2024-01-08      |
|ORD00000008|2024-01-09|2024-01-09      |
|ORD00000009|2024-01-10|2024-01-10      |
|ORD00000010|2024-01-11|2024-01-11      |
|ORD00000011|12/01/2024|2024-01-12      |
|ORD00000012|2024-01-13|2024-01-13      |
|ORD00000013|2024/01/14|2024-01-14      |
|ORD00000014|2024-01-15|2024-01-15      |
|ORD00000015|2024-01-16|2024-01-16      |
|ORD00000016|2024-01-17|2024-01-17      |
|ORD00000017|2024-01-18|2024-01-18      |
|ORD00000018|2024-01-19|2024-01-19      |
|ORD00000019|2024-01-20|2024-01-20      |
+-----------+----------+----------

In [29]:
#24
csv_orders_df.filter(F.col("order_date_clean").isNull()).count()

2595

PHASE 6 — BUSINESS FILTERING &
DEDUPLICATION

Exercises

25. Identify duplicate order_id values
26. Remove duplicate orders safely
27. Keep only records with status = Completed
28. Validate record counts before and after filtering

In [30]:
#25
csv_orders_df.groupBy("order_id").count().filter(F.col("count")>1).show(20, truncate=False)

+--------+-----+
|order_id|count|
+--------+-----+
+--------+-----+



In [32]:
#26
csv_orders_df = csv_orders_df.dropDuplicates(["order_id"])
csv_orders_df.count()

before_filter_count = csv_orders_df.count()
print("Record count before Filtering: ", before_filter_count)

Record count before Filtering:  300000


In [33]:
#27
csv_orders_df = csv_orders_df.filter(F.col("status") == "COMPLETED")
after_filter_count = csv_orders_df.count()
csv_orders_df.count()

285000

In [34]:
#28
print("Record count before Filtering: ", before_filter_count)
print("Record count after Filtering: ", after_filter_count)

Record count before Filtering:  300000
Record count after Filtering:  285000


PHASE 7 — PERFORMANCE & PARTITION
AWARENESS

Exercises

29. Check the default number of partitions
30. Run a heavy groupBy and observe execution time
31. Use explain(True) to identify shuffle stages
32. Repartition the DataFrame by city
33. Compare execution plans before and after repartition

In [35]:
#29
csv_orders_df.rdd.getNumPartitions()

2

In [36]:
#30
csv_orders_df.groupBy("city").agg(F.sum("amount_int")).show(20, truncate=False)

+---------+---------------+
|city     |sum(amount_int)|
+---------+---------------+
|KOLKATA  |1624300497     |
|BANGALORE|1628527093     |
|DELHI    |1639639916     |
|HYDERABAD|1642443340     |
|CHENNAI  |1629865247     |
|PUNE     |1646196535     |
|MUMBAI   |1625518096     |
+---------+---------------+



In [37]:
#31
csv_orders_df.groupBy("city").agg(F.sum("amount_int")).explain(True)

== Parsed Logical Plan ==
'Aggregate ['city], ['city, unresolvedalias('sum('amount_int))]
+- Filter (status#535 = COMPLETED)
   +- Deduplicate [order_id#478]
      +- Deduplicate [order_id#478]
         +- Project [order_id#478, customer_id#479, city#519, category#520, product#521, amount#483, order_date#484, status#535, amount_clean#579, amount_int#588, coalesce(cast(try_to_timestamp(order_date#484, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#484, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#484, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#484, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false) as date)) AS order_date_clean#656]
            +- Project [order_id#478, customer_id#479, city#519, category#520, product#521, amount#483, order_date#484, status#535, amount_clean#579, CASE WHEN RLIKE(amount_clean#579, ^[0-9]+$) THE

In [38]:
#32
csv_orders_df = csv_orders_df.repartition("city")
csv_orders_df.rdd.getNumPartitions()

3

In [39]:
#33
csv_orders_df.groupBy("city").agg(F.sum("amount_int")).explain(True)

== Parsed Logical Plan ==
'Aggregate ['city], ['city, unresolvedalias('sum('amount_int))]
+- RepartitionByExpression [city#519]
   +- Filter (status#535 = COMPLETED)
      +- Deduplicate [order_id#478]
         +- Deduplicate [order_id#478]
            +- Project [order_id#478, customer_id#479, city#519, category#520, product#521, amount#483, order_date#484, status#535, amount_clean#579, amount_int#588, coalesce(cast(try_to_timestamp(order_date#484, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#484, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#484, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#484, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false) as date)) AS order_date_clean#656]
               +- Project [order_id#478, customer_id#479, city#519, category#520, product#521, amount#483, order_date#484, status#535, amount_clean

PHASE 8 — ANALYTICS ON LARGE DATA

Exercises

34. Calculate total revenue per city
35. Calculate total revenue per category
36. Calculate total revenue per product
37. Identify top 10 products by revenue
38. Calculate average order value per city

In [40]:
#34
city_revenue_df = csv_orders_df.groupBy("city").agg(F.sum("amount_int").alias("total_revenue"))
city_revenue_df.show(20, truncate=False)

+---------+-------------+
|city     |total_revenue|
+---------+-------------+
|KOLKATA  |1624300497   |
|BANGALORE|1628527093   |
|DELHI    |1639639916   |
|HYDERABAD|1642443340   |
|CHENNAI  |1629865247   |
|PUNE     |1646196535   |
|MUMBAI   |1625518096   |
+---------+-------------+



In [41]:
#35
csv_orders_df.groupBy("category").agg(F.sum("amount_int").alias("total_revenue")).show(20, truncate=False)

+-----------+-------------+
|category   |total_revenue|
+-----------+-------------+
|HOME       |2868467576   |
|ELECTRONICS|2867568870   |
|GROCERY    |2866272106   |
|FASHION    |2834182172   |
+-----------+-------------+



In [42]:
#36
csv_orders_df.groupBy("product").agg(F.sum("amount_int").alias("total_revenue")).show(20, truncate=False)

+-----------+-------------+
|product    |total_revenue|
+-----------+-------------+
|MOBILE     |944352576    |
|SHOES      |946799102    |
|MIXER      |957140026    |
|AIRPURIFIER|952178123    |
|JEANS      |951286127    |
|RICE       |954494237    |
|TSHIRT     |936096943    |
|TABLET     |960719999    |
|OIL        |963572869    |
|SUGAR      |948205000    |
|VACUUM     |959149427    |
|LAPTOP     |962496295    |
+-----------+-------------+



In [43]:
#37
csv_orders_df.groupBy("product").agg(F.sum("amount_int").alias("total_revenue")).orderBy(F.col("total_revenue").desc()) .show(10, truncate=False)

+-----------+-------------+
|product    |total_revenue|
+-----------+-------------+
|OIL        |963572869    |
|LAPTOP     |962496295    |
|TABLET     |960719999    |
|VACUUM     |959149427    |
|MIXER      |957140026    |
|RICE       |954494237    |
|AIRPURIFIER|952178123    |
|JEANS      |951286127    |
|SUGAR      |948205000    |
|SHOES      |946799102    |
+-----------+-------------+
only showing top 10 rows


In [44]:
#38
csv_orders_df.groupBy("city").agg(F.avg("amount_int").alias("avg_order_value")).show(20, truncate=False)

+---------+------------------+
|city     |avg_order_value   |
+---------+------------------+
|KOLKATA  |43709.816662630175|
|BANGALORE|44098.867908689645|
|DELHI    |43817.20780331374 |
|HYDERABAD|43708.74045293664 |
|CHENNAI  |43628.27900315863 |
|PUNE     |43930.204013556424|
|MUMBAI   |43723.75651612556 |
+---------+------------------+



PHASE 9 — WINDOW FUNCTIONS (BIG DATA
SAFE)

Exercises

39. Rank cities by total revenue
40. Rank products within each category by revenue
41. Identify the top product per category
42. Identify top 3 cities using window functions

In [45]:
#39
city_window = Window.orderBy(F.col("total_revenue").desc())
city_revenue_df.withColumn("rank", F.rank().over(city_window)).show(20, truncate=False)

+---------+-------------+----+
|city     |total_revenue|rank|
+---------+-------------+----+
|PUNE     |1646196535   |1   |
|HYDERABAD|1642443340   |2   |
|DELHI    |1639639916   |3   |
|CHENNAI  |1629865247   |4   |
|BANGALORE|1628527093   |5   |
|MUMBAI   |1625518096   |6   |
|KOLKATA  |1624300497   |7   |
+---------+-------------+----+



In [46]:
#40
product_revenue_df = csv_orders_df.groupBy("category", "product").agg(F.sum("amount_int").alias("revenue"))
category_window = Window.partitionBy("category").orderBy(F.col("revenue").desc())
product_revenue_df.withColumn("rank", F.rank().over(category_window)).show(20, truncate=False)

+-----------+-----------+---------+----+
|category   |product    |revenue  |rank|
+-----------+-----------+---------+----+
|ELECTRONICS|LAPTOP     |962496295|1   |
|ELECTRONICS|TABLET     |960719999|2   |
|ELECTRONICS|MOBILE     |944352576|3   |
|FASHION    |JEANS      |951286127|1   |
|FASHION    |SHOES      |946799102|2   |
|FASHION    |TSHIRT     |936096943|3   |
|GROCERY    |OIL        |963572869|1   |
|GROCERY    |RICE       |954494237|2   |
|GROCERY    |SUGAR      |948205000|3   |
|HOME       |VACUUM     |959149427|1   |
|HOME       |MIXER      |957140026|2   |
|HOME       |AIRPURIFIER|952178123|3   |
+-----------+-----------+---------+----+



In [47]:
#41
product_revenue_df.withColumn("rank", F.rank().over(category_window)) .filter(F.col("rank") == 1).show(20, truncate=False)

+-----------+-------+---------+----+
|category   |product|revenue  |rank|
+-----------+-------+---------+----+
|ELECTRONICS|LAPTOP |962496295|1   |
|FASHION    |JEANS  |951286127|1   |
|GROCERY    |OIL    |963572869|1   |
|HOME       |VACUUM |959149427|1   |
+-----------+-------+---------+----+



In [48]:
#42
city_revenue_df.withColumn("rank", F.rank().over(city_window)).filter(F.col("rank") <= 3).show()

+---------+-------------+----+
|     city|total_revenue|rank|
+---------+-------------+----+
|     PUNE|   1646196535|   1|
|HYDERABAD|   1642443340|   2|
|    DELHI|   1639639916|   3|
+---------+-------------+----+



PHASE 10 — CACHING & REUSE

Exercises

43. Identify DataFrames reused multiple times
44. Apply caching strategically
45. Re-run analytics and observe performance
46. Unpersist when cache is no longer needed
47. Explain why over-caching is dangerous

In [None]:
#43
#csv_orders_df is reused in multiple aggregations

In [49]:
#44
csv_orders_df.cache()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: string, order_date: string, status: string, amount_clean: string, amount_int: int, order_date_clean: date]

In [50]:
#45
csv_orders_df.groupBy("city").agg(F.sum("amount_int").alias("total_revenue")).show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|  KOLKATA|   1624300497|
|BANGALORE|   1628527093|
|    DELHI|   1639639916|
|HYDERABAD|   1642443340|
|  CHENNAI|   1629865247|
|     PUNE|   1646196535|
|   MUMBAI|   1625518096|
+---------+-------------+



In [51]:
#46
csv_orders_df.unpersist()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: string, order_date: string, status: string, amount_clean: string, amount_int: int, order_date_clean: date]

In [52]:
#47
#consumes executor memory
#causes disk spill
#slows other spark jobs
#can crash executions

PHASE 11 — FILE FORMAT STRATEGY

Exercises

48. Write the cleaned order-level dataset to Parquet
49. Partition the Parquet output by city
50. Write aggregated analytics to ORC
51. Read both formats back and validate schema
52. Compare number of output files generated

In [53]:
#48
csv_orders_df.write.mode("overwrite").parquet("/content/orders_parquet")

In [54]:
#49
csv_orders_df.write.mode("overwrite").partitionBy("city").parquet("/content/orders_parquet_city")

In [55]:
#50
city_revenue_df.write.mode("overwrite").orc("/content/city_revenue_orc")

In [56]:
#51
spark.read.parquet("/content/orders_parquet_city").printSchema()
spark.read.orc("/content/city_revenue_orc").printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- amount_clean: string (nullable = true)
 |-- amount_int: integer (nullable = true)
 |-- order_date_clean: date (nullable = true)
 |-- city: string (nullable = true)

root
 |-- city: string (nullable = true)
 |-- total_revenue: long (nullable = true)



In [57]:
#52
spark.read.parquet("/content/orders_parquet_city").rdd.getNumPartitions()

2

PHASE 12 — DEBUGGING & FAILURE
SCENARIOS

Exercises

53. Explain why the following line breaks pipelines:

df = df.filter(df.amount > 50000).show()

54. Create a scenario that produces a NoneType error
55. Identify a transformation that causes a wide shuffle
56. Explain how you would debug a slow Spark job

In [58]:
#53
csv_orders_df = csv_orders_df.filter(csv_orders_df.amount > 50000).show()

{"ts": "2025-12-26 12:41:31.316", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[CAST_INVALID_INPUT] The value '12,000' of the type \"STRING\" cannot be cast to \"BIGINT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018", "context": {"file": "line 2 in cell [58]", "line": "", "fragment": "__gt__", "errorClass": "CAST_INVALID_INPUT"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o490.showString.\n: org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '12,000' of the type \"STRING\" cannot be cast to \"BIGINT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018\n== DataFrame ==\n\"__gt__\" was called from\nline 2 in cell [58]\n\n\tat org.apache.spark.sql.errors.QueryE

NumberFormatException: [CAST_INVALID_INPUT] The value '12,000' of the type "STRING" cannot be cast to "BIGINT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"__gt__" was called from
line 2 in cell [58]


In [59]:
#54
df = csv_orders_df.show()
df.count()

+-----------+-----------+-------+-----------+-----------+-------+----------+---------+------------+----------+----------------+
|   order_id|customer_id|   city|   category|    product| amount|order_date|   status|amount_clean|amount_int|order_date_clean|
+-----------+-----------+-------+-----------+-----------+-------+----------+---------+------------+----------+----------------+
|ORD00000011|    C000011|KOLKATA|ELECTRONICS|     TABLET|  50318|12/01/2024|COMPLETED|       50318|     50318|      2024-01-12|
|ORD00000036|    C000036|KOLKATA|    GROCERY|        OIL|  29253|2024-02-06|COMPLETED|       29253|     29253|      2024-02-06|
|ORD00000048|    C000048|KOLKATA|    FASHION|      JEANS|  51000|2024-02-18|COMPLETED|       51000|     51000|      2024-02-18|
|ORD00000054|    C000054|KOLKATA|    GROCERY|        OIL|  26434|2024-02-24|COMPLETED|       26434|     26434|      2024-02-24|
|ORD00000104|    C000104|KOLKATA|    FASHION|      JEANS|  32476|2024/02/14|COMPLETED|       32476|     

AttributeError: 'NoneType' object has no attribute 'count'

In [None]:
#55
#causes wide shuffle
#groupBy(), join(), distinct(), orderBy()

In [None]:
#56
#explain(True)
#Spark UI
#Check skew
#Optimize partitions
#Broadcast joins

PHASE 13 — FINAL VALIDATION

Exercises

57. Validate no nulls in critical columns
58. Confirm correct data types for all columns
59. Validate final record count
60. Document three optimization decisions you made

In [60]:
#57
csv_orders_df.filter(
    F.col("order_id").isNull() |
    F.col("amount_int").isNull() |
    F.col("order_date_clean").isNull()
).count()

26166

In [61]:
#58
csv_orders_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- amount_clean: string (nullable = true)
 |-- amount_int: integer (nullable = true)
 |-- order_date_clean: date (nullable = true)



In [62]:
#59
csv_orders_df.count()

285000

In [None]:
#60
#Explicit schema enforcement
#Repartition by city
#Strategic caching after data cleaning