In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [2]:
# Reuse the active Spark session if one exists, otherwise start one for this notebook.
spark = (
    SparkSession.builder
    .appName("Second Milestone")
    .getOrCreate()
)

In [3]:
# Colab-only step: mount Google Drive at /content/drive so files under MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Path of the raw orders CSV inside the mounted Google Drive.
csv_path = "/content/drive/MyDrive/Colab Notebooks/orders_raw.csv"

Section 1 — Data Ingestion & Schema

1. Read the CSV file using PySpark ensuring the job does not fail due to bad data.
2. Explain why reading all columns as StringType is preferred initially.
3. Print schema and total record count.

In [6]:
#1,3
# The first line is used as the header and schema inference is switched off, so every
# column is loaded as a string and malformed values cannot break the read.
df_csv_raw = spark.read.csv(csv_path, header=True, inferSchema=False)
df_csv_raw.printSchema()
df_csv_raw.count()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



300000

In [7]:
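#2
#Reading all columns as StringType initially:
#1 Prevents job failure or silent data loss when a value does not match an inferred type (e.g. amount = "invalid")
#2 Keeps the original text of order_date, so multiple date formats can be parsed explicitly later
#3 Allows custom validation and error handling before casting to the final types
#4 Avoids the extra pass over the file that inferSchema needs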

Section 2 — Data Cleaning & Validation

4. Clean leading/trailing spaces from string columns.
5. Standardize city, category, and product values.
6. Convert amount to integer safely, handling invalid values.
7. Parse order_date supporting multiple date formats.
8. Identify and handle invalid or null records.

In [8]:
#4
# Every column is still a string at this point, so trim() is applied to all of them;
# column names and order are unchanged.
trimmed_df = df_csv_raw.withColumns(
    {name: trim(col(name)) for name in df_csv_raw.columns}
)
trimmed_df.show()

+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|     city|   category|    product| amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000|hyderabad|    grocery|        Oil|invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|     Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|     Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C000004|     Pune|       Home|AirPurifier|  33659|2024-01-05|Completed|
|ORD00000005|    C000005|    Delhi|    Fashion|      Jeans|   8521|2024-01-06|Completed|
|ORD00000006|    C000006|    Delhi|    Grocery|      Sugar|  42383|2024-01-07|Completed|
|ORD00000007|    C000007|     Pune|    Grocery|       Rice|  45362|2024-01-08|Completed|
|ORD00000008|    C000

In [9]:
#5
# Lower-case first, then capitalise the first letter of each word, so that
# "hyderabad" / "HYDERABAD" both become "Hyderabad". Note that inner capitals are
# lost as well (e.g. "AirPurifier" -> "Airpurifier").
std_df = trimmed_df.withColumns(
    {
        name: initcap(lower(col(name)))
        for name in ("city", "category", "product")
    }
)
std_df.show()

+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|     city|   category|    product| amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000|Hyderabad|    Grocery|        Oil|invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|     Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|     Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C000004|     Pune|       Home|Airpurifier|  33659|2024-01-05|Completed|
|ORD00000005|    C000005|    Delhi|    Fashion|      Jeans|   8521|2024-01-06|Completed|
|ORD00000006|    C000006|    Delhi|    Grocery|      Sugar|  42383|2024-01-07|Completed|
|ORD00000007|    C000007|     Pune|    Grocery|       Rice|  45362|2024-01-08|Completed|
|ORD00000008|    C000

In [11]:
#6
# Only unsigned digit strings are accepted; anything else ("invalid", "", "12.5", "-5")
# becomes NULL because when() without otherwise() yields NULL for unmatched rows.
# try_cast is used instead of cast so that a digit string too large for a 32-bit int
# also becomes NULL rather than raising an overflow error when ANSI mode is enabled.
df_clean = std_df.withColumn(
    "amount",
    when(col("amount").rlike("^[0-9]+$"), expr("try_cast(amount AS INT)")),
)
df_clean.show()

+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|   order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|ORD00000000|    C000000|Hyderabad|    Grocery|        Oil|  NULL|01/01/2024|Cancelled|
|ORD00000001|    C000001|     Pune|    Grocery|      Sugar| 35430|2024-01-02|Completed|
|ORD00000002|    C000002|     Pune|Electronics|     Mobile| 65358|2024-01-03|Completed|
|ORD00000003|    C000003|Bangalore|Electronics|     Laptop|  5558|2024-01-04|Completed|
|ORD00000004|    C000004|     Pune|       Home|Airpurifier| 33659|2024-01-05|Completed|
|ORD00000005|    C000005|    Delhi|    Fashion|      Jeans|  8521|2024-01-06|Completed|
|ORD00000006|    C000006|    Delhi|    Grocery|      Sugar| 42383|2024-01-07|Completed|
|ORD00000007|    C000007|     Pune|    Grocery|       Rice| 45362|2024-01-08|Completed|
|ORD00000008|    C000008|Bangalo

In [12]:
#7
# Each pattern is tried in the order listed and the first one that parses wins;
# values matching none of them become NULL. Because dd-MM-yyyy precedes MM-dd-yyyy
# (and dd/MM/yyyy precedes MM/dd/yyyy), an ambiguous value such as 01-02-2024 is
# read as day-month; the month-first patterns only apply when the day-first parse fails.
df_clean = df_clean.withColumn(
    "order_date",
    coalesce(
        *[
            try_to_timestamp(col("order_date"), lit(pattern))
            for pattern in (
                "yyyy-MM-dd",
                "dd-MM-yyyy",
                "MM-dd-yyyy",
                "dd/MM/yyyy",
                "MM/dd/yyyy",
                "yyyy/MM/dd",
            )
        ]
    ).cast(DateType()),
)
df_clean.show()

+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|   order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|ORD00000000|    C000000|Hyderabad|    Grocery|        Oil|  NULL|2024-01-01|Cancelled|
|ORD00000001|    C000001|     Pune|    Grocery|      Sugar| 35430|2024-01-02|Completed|
|ORD00000002|    C000002|     Pune|Electronics|     Mobile| 65358|2024-01-03|Completed|
|ORD00000003|    C000003|Bangalore|Electronics|     Laptop|  5558|2024-01-04|Completed|
|ORD00000004|    C000004|     Pune|       Home|Airpurifier| 33659|2024-01-05|Completed|
|ORD00000005|    C000005|    Delhi|    Fashion|      Jeans|  8521|2024-01-06|Completed|
|ORD00000006|    C000006|    Delhi|    Grocery|      Sugar| 42383|2024-01-07|Completed|
|ORD00000007|    C000007|     Pune|    Grocery|       Rice| 45362|2024-01-08|Completed|
|ORD00000008|    C000008|Bangalo

In [14]:
#8
# A record is kept only when its key, its parsed date and its parsed amount are all
# present; rows whose amount or order_date failed conversion were set to NULL earlier
# and are dropped here.
valid_df = df_clean.where(
    "order_id IS NOT NULL AND order_date IS NOT NULL AND amount IS NOT NULL"
)
valid_df.show()

+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|   order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|ORD00000001|    C000001|     Pune|    Grocery|      Sugar| 35430|2024-01-02|Completed|
|ORD00000002|    C000002|     Pune|Electronics|     Mobile| 65358|2024-01-03|Completed|
|ORD00000003|    C000003|Bangalore|Electronics|     Laptop|  5558|2024-01-04|Completed|
|ORD00000004|    C000004|     Pune|       Home|Airpurifier| 33659|2024-01-05|Completed|
|ORD00000005|    C000005|    Delhi|    Fashion|      Jeans|  8521|2024-01-06|Completed|
|ORD00000006|    C000006|    Delhi|    Grocery|      Sugar| 42383|2024-01-07|Completed|
|ORD00000007|    C000007|     Pune|    Grocery|       Rice| 45362|2024-01-08|Completed|
|ORD00000008|    C000008|Bangalore|    Fashion|      Jeans| 10563|2024-01-09|Completed|
|ORD00000009|    C000009|  Kolka

Section 3 — Business Rules

9. Remove duplicate records based on order_id.
10. Filter only records with status = Completed.
11. Validate record counts before and after filtering.

In [18]:
#9
# Keep one row per order_id. No ordering is imposed, so which of several duplicate
# rows survives is not specified.
distinct_df = valid_df.drop_duplicates(subset=["order_id"])
#10
# Business rule: only completed orders take part in the analytics.
filtered_df = distinct_df.where(col("status") == "Completed")
filtered_df.show()


+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|   order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|ORD00000001|    C000001|     Pune|    Grocery|      Sugar| 35430|2024-01-02|Completed|
|ORD00000007|    C000007|     Pune|    Grocery|       Rice| 45362|2024-01-08|Completed|
|ORD00000008|    C000008|Bangalore|    Fashion|      Jeans| 10563|2024-01-09|Completed|
|ORD00000010|    C000010|Bangalore|    Grocery|      Sugar| 66576|2024-01-11|Completed|
|ORD00000011|    C000011|  Kolkata|Electronics|     Tablet| 50318|2024-01-12|Completed|
|ORD00000012|    C000012|Bangalore|    Grocery|      Sugar| 84768|2024-01-13|Completed|
|ORD00000014|    C000014|   Mumbai|Electronics|     Tablet| 79469|2024-01-15|Completed|
|ORD00000015|    C000015|     Pune|Electronics|     Mobile| 81018|2024-01-16|Completed|
|ORD00000017|    C000017|Bangalo

In [19]:
#11
# valid_df is the state before BOTH de-duplication and the status filter, so the
# intermediate count is reported as well; otherwise the drop cannot be attributed
# to duplicates versus non-completed orders.
valid_count = valid_df.count()
distinct_count = distinct_df.count()
completed_count = filtered_df.count()

print("Record count before filtering:", valid_count)
print("Record count after de-duplication:", distinct_count)
print("Record count after filtering:", completed_count)

# Each step only removes rows, so the counts must never increase.
assert valid_count >= distinct_count >= completed_count, (
    "record count increased between pipeline stages"
)

Record count before filtering: 260206
Record count after filtering: 247194


Section 4 — Performance & Optimization

12. Identify operations that cause shuffles.
13. Use explain(True) to analyze the execution plan.
14. Apply repartitioning to optimize aggregations.
15. Justify where caching should be applied and why.

In [20]:
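#12
#Operations that cause shuffles (wide transformations):
#groupBy(...).agg(...), orderBy
#dropDuplicates, repartition, window functions (Window.partitionBy / Window.orderBy)
#join (shuffles unless one side is small enough to be broadcast)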

In [21]:
#13
# Extended mode prints the logical plans as well as the physical plan.
filtered_df.explain(extended=True)

== Parsed Logical Plan ==
'Filter '`=`('status, Completed)
+- Deduplicate [order_id#37]
   +- Filter ((isnotnull(order_id#37) AND isnotnull(order_date#183)) AND isnotnull(amount#149))
      +- Project [order_id#37, customer_id#38, city#79, category#80, product#81, amount#149, cast(coalesce(try_to_timestamp(order_date#43, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#43, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#43, Some(MM-dd-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#43, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#43, Some(MM/dd/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#43, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false)) as date) AS order_date#183, status#44]
         +- Project [order_id#37, customer_id#38, city#79, category#80, product#81, CASE WHEN RLIKE(amount#42, ^[0-9]+$) THEN cast(amount#42 a

In [23]:
#14
# Hash-partition by city so that all rows of one city end up in the same partition.
# NOTE(review): the aggregations in Section 5 read filtered_df, not repartitioned_df,
# so this repartitioning is only demonstrated here - confirm whether that is intended.
repartitioned_df = filtered_df.repartition(col("city"))
repartitioned_df.show()

+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|   order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|ORD00000008|    C000008|Bangalore|    Fashion|      Jeans| 10563|2024-01-09|Completed|
|ORD00000010|    C000010|Bangalore|    Grocery|      Sugar| 66576|2024-01-11|Completed|
|ORD00000012|    C000012|Bangalore|    Grocery|      Sugar| 84768|2024-01-13|Completed|
|ORD00000017|    C000017|Bangalore|    Grocery|        Oil| 69582|2024-01-18|Completed|
|ORD00000024|    C000024|Bangalore|       Home|      Mixer| 18082|2024-01-25|Completed|
|ORD00000025|    C000025|Bangalore|       Home|Airpurifier| 58248|2024-01-26|Completed|
|ORD00000124|    C000124|Bangalore|    Grocery|      Sugar| 54296|2024-01-05|Completed|
|ORD00000159|    C000159|Bangalore|Electronics|     Tablet| 89397|2024-02-09|Completed|
|ORD00000196|    C000196|Bangalo

In [None]:
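#15
#filtered_df should be cached because it is reused by several actions (the record counts, every aggregation in Sections 5 and 6, and the Parquet write); caching avoids re-reading the CSV and recomputing the cleaning and de-duplication steps for each of them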

Section 5 — Analytics

16. Calculate total revenue per city.
17. Calculate total revenue per category.
18. Identify top 5 products by revenue.
19. Calculate average order value per city.

In [28]:
# Every aggregation below starts from filtered_df. Without caching, each show()
# re-reads the CSV and repeats the cleaning and de-duplication; caching computes
# that lineage once and makes all aggregates work from the same de-duplicated rows.
filtered_df.cache()

# sum/avg below are the pyspark.sql.functions versions brought in by the star import,
# not Python's built-ins.

#16
city_rev = filtered_df.groupBy("city").agg(sum("amount").alias("total_revenue"))
city_rev.show()

#17
category_rev = filtered_df.groupBy("category").agg(sum("amount").alias("total_revenue"))
category_rev.show()

#18
# Revenue per product, highest first; the first five rows are the top products.
product_rev = (
    filtered_df
    .groupBy("product")
    .agg(sum("amount").alias("revenue"))
    .orderBy(col("revenue").desc())
)
top_prod = product_rev.limit(5)
top_prod.show()

#19
city_avg = filtered_df.groupBy("city").agg(avg("amount").alias("avg_order_value"))
city_avg.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|   1595074422|
|  Chennai|   1594948816|
|   Mumbai|   1592800385|
|  Kolkata|   1589940642|
|     Pune|   1611282681|
|    Delhi|   1602665640|
|Hyderabad|   1609240412|
+---------+-------------+

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|       Home|   2808394769|
|    Fashion|   2774418007|
|    Grocery|   2806744668|
|Electronics|   2806395554|
+-----------+-------------+

+-------+---------+
|product|  revenue|
+-------+---------+
|    Oil|943582283|
| Laptop|943169707|
| Tablet|939977603|
| Vacuum|939615394|
|  Mixer|937101483|
+-------+---------+

+---------+------------------+
|     city|   avg_order_value|
+---------+------------------+
|Bangalore| 45573.55491428571|
|  Chennai| 45094.54086912268|
|   Mumbai|45190.954576405835|
|  Kolkata| 45229.15944585099|
|     Pune|45417.670067931336|
|    Delhi|45331.946597273294|
|Hyderabad|   45209.73204101

Section 6 — Window Functions

20. Rank cities by total revenue.
21. Rank products within each category by revenue.
22. Identify the top product per category.

In [29]:
#20
# No partitionBy: all cities are ranked together in one global ordering, highest
# revenue first. rank() gives tied cities the same rank.
city_window = Window.orderBy(desc("total_revenue"))
city_rank = city_rev.select("*", rank().over(city_window).alias("rank"))
city_rank.show()

+---------+-------------+----+
|     city|total_revenue|rank|
+---------+-------------+----+
|     Pune|   1611282681|   1|
|Hyderabad|   1609240412|   2|
|    Delhi|   1602665640|   3|
|Bangalore|   1595074422|   4|
|  Chennai|   1594948816|   5|
|   Mumbai|   1592800385|   6|
|  Kolkata|   1589940642|   7|
+---------+-------------+----+



In [31]:
#21
# Products are ranked separately inside each category, highest revenue first.
cat_window = Window.partitionBy("category").orderBy(desc("revenue"))
product_rank = (
    filtered_df
    .groupBy("category", "product")
    .agg(sum("amount").alias("revenue"))
    .withColumn("rank", rank().over(cat_window))
)
product_rank.show()

+-----------+-----------+---------+----+
|   category|    product|  revenue|rank|
+-----------+-----------+---------+----+
|Electronics|     Laptop|943169707|   1|
|Electronics|     Tablet|939977603|   2|
|Electronics|     Mobile|923248244|   3|
|    Fashion|      Jeans|930910809|   1|
|    Fashion|      Shoes|926552299|   2|
|    Fashion|     Tshirt|916954899|   3|
|    Grocery|        Oil|943582283|   1|
|    Grocery|       Rice|934334225|   2|
|    Grocery|      Sugar|928828160|   3|
|       Home|     Vacuum|939615394|   1|
|       Home|      Mixer|937101483|   2|
|       Home|Airpurifier|931677892|   3|
+-----------+-----------+---------+----+



In [34]:
#22
# Rank 1 within each category is that category's best-selling product; the helper
# rank column is not part of the result.
top_product = (
    product_rank
    .where(col("rank") == 1)
    .select("category", "product", "revenue")
)
top_product.show()

+-----------+-------+---------+
|   category|product|  revenue|
+-----------+-------+---------+
|Electronics| Laptop|943169707|
|    Fashion|  Jeans|930910809|
|    Grocery|    Oil|943582283|
|       Home| Vacuum|939615394|
+-----------+-------+---------+



Section 7 — Storage Strategy

23. Write the cleaned dataset to Parquet partitioned by city.
24. Write aggregated analytics to ORC.
25. Explain why CSV is not suitable for analytics output.

In [36]:
#23
# Output is split into one sub-directory per city value; anything already at the
# target path is replaced.
(
    filtered_df.write
    .partitionBy("city")
    .mode("overwrite")
    .parquet("/data/orders_parquet")
)

In [37]:
#24
# Persist the per-city revenue aggregate in ORC format, replacing any previous output.
(
    city_rev.write
    .format("orc")
    .mode("overwrite")
    .save("/data/city_revenue_orc")
)

In [None]:
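#25
#CSV is not suitable for analytics output because:
#it has no schema enforcement: column types are not stored, so every reader must re-parse and re-validate the values
#it is not compressed by default and is plain row-based text, so files are larger than Parquet/ORC
#it has slow I/O: a query must read whole rows and cannot skip unused columns or push filters down to the file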

Section 8 — Debugging & Reasoning

26. Explain why the following line causes failure:
df = df.filter(df.amount > 50000).show()
27. Describe how you would debug a slow Spark job.
28. Identify risks of over-caching DataFrames

In [None]:
#26
# Failing line (kept as a comment so that this cell can run):
#     df = df.filter(df.amount > 50000).show()
#.show() prints the rows and returns None
#Assigning the result of .show() to df overwrites it with None, so every later use of df
#(df.filter, df.count, ...) raises AttributeError and the pipeline breaks

#correct way: keep the DataFrame in its own variable and call .show() separately
high_value_df = filtered_df.filter(filtered_df.amount > 50000)
high_value_df.show()

In [None]:
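#27
#To debug a slow Spark job:
#Check the Spark UI to find the slowest stages and tasks, and look for data skew or spills
#Use explain(True) to inspect the plan and locate the shuffles (Exchange nodes)
#Reduce shuffles (filter early, avoid unnecessary repartition/orderBy)
#Cache effectively: only DataFrames that are reused by several actions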

In [None]:
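#28
#Risks of over-caching are:
#Memory pressure: cached data competes with the memory needed for execution
#Slower jobs due to spills to disk, and cached data being evicted and recomputed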