In [42]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

spark=SparkSession.builder.appName("Online Market").getOrCreate()

PHASE 1 — SCHEMA & INGESTION

Tasks
1. Define an explicit schema
2. Create a DataFrame using the schema
3. Print and verify schema

In [43]:
orders_data = [
    ("ORD001","C001","Delhi ","Electronics","Laptop","45000","2024-01-05","Completed"),
    ("ORD002","C002","Mumbai","Electronics","Mobile ","32000","05/01/2024","Completed"),
    ("ORD003","C003","Bangalore","Electronics","Tablet","30000","2024/01/06","Completed"),
    ("ORD004","C004","Delhi","Electronics","Laptop","","2024-01-07","Cancelled"),
    ("ORD005","C005","Chennai","Electronics","Mobile","invalid","2024-01-08","Completed"),
    ("ORD006","C006","Mumbai","Home","Mixer",None,"2024-01-08","Completed"),
    ("ORD007","C001","Delhi","Electronics","Laptop","47000","09-01-2024","Completed"),
    ("ORD008","C007","Bangalore","Home","Vacuum","28000","2024-01-09","Completed"),
    ("ORD009","C002","Mumbai","Electronics","Laptop","55000","2024-01-10","Completed"),
    ("ORD010","C008","Delhi","Home","AirPurifier","38000","2024-01-10","Completed"),
    ("ORD011","C009","Mumbai","Home","Vacuum","29000","2024-01-11","Completed"),
    ("ORD012","C010","Bangalore","Electronics","Mobile","33000","2024-01-11","Completed"),
    ("ORD013","C003","Bangalore","Home","Mixer","21000","2024-01-12","Completed"),
    ("ORD014","C004","Delhi","Electronics","Tablet","26000","2024-01-12","Completed"),
    ("ORD015","C005","Chennai","Electronics","Laptop","62000","2024-01-13","Completed"),
    ("ORD016","C006","Mumbai","Home","AirPurifier","40000","2024-01-13","Completed"),
    ("ORD017","C007","Bangalore","Electronics","Laptop","51000","2024-01-14","Completed"),
    ("ORD018","C008","Delhi","Home","Vacuum","31000","2024-01-14","Completed"),
    ("ORD019","C009","Mumbai","Electronics","Tablet","29000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed")
]
columns = ["order_id","customer_id","city","category","product","amount","order_date","status"]
orders_df = spark.createDataFrame(orders_data,columns)
orders_df.show()
orders_df.printSchema()

+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|order_id|customer_id|     city|   category|    product| amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|  ORD001|       C001|   Delhi |Electronics|     Laptop|  45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|    Mobile |  32000|05/01/2024|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet|  30000|2024/01/06|Completed|
|  ORD004|       C004|    Delhi|Electronics|     Laptop|       |2024-01-07|Cancelled|
|  ORD005|       C005|  Chennai|Electronics|     Mobile|invalid|2024-01-08|Completed|
|  ORD006|       C006|   Mumbai|       Home|      Mixer|   NULL|2024-01-08|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop|  47000|09-01-2024|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum|  28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Lapto

PHASE 2 — DATA CLEANING

Tasks

4. Trim all string columns
5. Standardize city , category , and product values
6. Convert amount to IntegerType
7. Handle invalid, empty, and null amount values
8. Convert order_date into DateType (handle multiple formats)
9. Remove duplicate order_id records
10. Keep only Completed orders

In [53]:
#4
for c in orders_df.columns:
    orders_df = orders_df.withColumn(c, trim(col(c)))
orders_df.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024-01-06|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|2024-01-09|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|2024-01-10|Completed|
|  ORD010|       C008|    Delhi|       Home|Airpurifier| 38000|2024-01-10|Completed|
|  ORD011|       C009|   Mumbai|       Home|     Vacuum| 29000|2024-01-11|Completed|
|  ORD012|       C010|Bangalore|Electronics|     Mobile| 33000|20

In [54]:
#5
df_clean = orders_df.withColumn("city", initcap(lower(col("city")))).withColumn("category", initcap(lower(col("category"))))\
.withColumn("product", initcap(lower(col("product"))))
df_clean.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024-01-06|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|2024-01-09|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|2024-01-10|Completed|
|  ORD010|       C008|    Delhi|       Home|Airpurifier| 38000|2024-01-10|Completed|
|  ORD011|       C009|   Mumbai|       Home|     Vacuum| 29000|2024-01-11|Completed|
|  ORD012|       C010|Bangalore|Electronics|     Mobile| 33000|20

In [55]:
#6
df_clean = df_clean.withColumn("amount",when(col("amount").rlike("^[0-9]+$"), col("amount").cast(IntegerType())))
df_clean.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024-01-06|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|2024-01-09|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|2024-01-10|Completed|
|  ORD010|       C008|    Delhi|       Home|Airpurifier| 38000|2024-01-10|Completed|
|  ORD011|       C009|   Mumbai|       Home|     Vacuum| 29000|2024-01-11|Completed|
|  ORD012|       C010|Bangalore|Electronics|     Mobile| 33000|20

In [56]:
#7
df_clean = df_clean.filter(col("amount").isNotNull())
df_clean.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024-01-06|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|2024-01-09|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|2024-01-10|Completed|
|  ORD010|       C008|    Delhi|       Home|Airpurifier| 38000|2024-01-10|Completed|
|  ORD011|       C009|   Mumbai|       Home|     Vacuum| 29000|2024-01-11|Completed|
|  ORD012|       C010|Bangalore|Electronics|     Mobile| 33000|20

In [57]:
df_clean=df_clean.withColumn(
    "order_date",
    coalesce(
        try_to_timestamp(col("order_date"), lit("yyyy-MM-dd")),
        try_to_timestamp(col("order_date"), lit("dd-MM-yyyy")),
        try_to_timestamp(col("order_date"), lit("MM-dd-yyyy")),
        try_to_timestamp(col("order_date"), lit("dd/MM/yyyy")),
        try_to_timestamp(col("order_date"), lit("MM/dd/yyyy")),
        try_to_timestamp(col("order_date"), lit("yyyy/MM/dd"))
    ).cast(DateType())
)
df_clean.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024-01-06|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|2024-01-09|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|2024-01-10|Completed|
|  ORD010|       C008|    Delhi|       Home|Airpurifier| 38000|2024-01-10|Completed|
|  ORD011|       C009|   Mumbai|       Home|     Vacuum| 29000|2024-01-11|Completed|
|  ORD012|       C010|Bangalore|Electronics|     Mobile| 33000|20

In [58]:
#9
df_clean=orders_df.dropDuplicates(["order_id"])
df_clean.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024-01-06|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|2024-01-09|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|2024-01-10|Completed|
|  ORD010|       C008|    Delhi|       Home|Airpurifier| 38000|2024-01-10|Completed|
|  ORD011|       C009|   Mumbai|       Home|     Vacuum| 29000|2024-01-11|Completed|
|  ORD012|       C010|Bangalore|Electronics|     Mobile| 33000|20

In [59]:
#10
df_clean=df_clean.filter(col("status")=="Completed")
df_clean.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024-01-06|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|2024-01-09|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|2024-01-10|Completed|
|  ORD010|       C008|    Delhi|       Home|Airpurifier| 38000|2024-01-10|Completed|
|  ORD011|       C009|   Mumbai|       Home|     Vacuum| 29000|2024-01-11|Completed|
|  ORD012|       C010|Bangalore|Electronics|     Mobile| 33000|20

PHASE 3 — DATA VALIDATION

Tasks

11. Count records before and after cleaning
12. Verify no nulls in order_id , amount , and order_date
13. Confirm correct data types

In [60]:
#11
print("Total records before cleaning:", orders_df.count())
print("Total records after cleaning:", df_clean.count())

Total records before cleaning: 18
Total records after cleaning: 17


In [61]:
#12
df_clean.select([count(when(col(c).isNull(),"*")).alias(c) for c in df_clean.columns]).show()

+--------+-----------+----+--------+-------+------+----------+------+
|order_id|customer_id|city|category|product|amount|order_date|status|
+--------+-----------+----+--------+-------+------+----------+------+
|       0|          0|   0|       0|      0|     0|         0|     0|
+--------+-----------+----+--------+-------+------+----------+------+



In [62]:
#13
df_clean.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



PHASE 4 — ANALYTICS & AGGREGATIONS

Tasks

14. Total revenue per city
15. Total revenue per category
16. Total revenue per product
17. Average order value per city
18. Identify top 3 products by revenue

In [63]:
#14
city_revenue=df_clean.groupBy("city").agg(sum("amount").alias("total_revenue"))
city_revenue.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|     217000.0|
|  Chennai|      62000.0|
|   Mumbai|     185000.0|
|    Delhi|     187000.0|
+---------+-------------+



In [64]:
#15
category_revenue=df_clean.groupBy("category").agg(sum("amount").alias("total_revenue"))
category_revenue.show()

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|       Home|     187000.0|
|Electronics|     464000.0|
+-----------+-------------+



In [65]:
#16
product_revenue=df_clean.groupBy("product").agg(sum("amount").alias("total_revenue"))
product_revenue.show()

+-----------+-------------+
|    product|total_revenue|
+-----------+-------------+
|     Vacuum|      88000.0|
|Airpurifier|      78000.0|
|     Laptop|     314000.0|
|      Mixer|      21000.0|
|     Mobile|      65000.0|
|     Tablet|      85000.0|
+-----------+-------------+



In [66]:
#17
avg_order_city=df_clean.groupBy("city").agg(avg("amount").alias("avg_order_value"))
avg_order_city.show()

+---------+------------------+
|     city|   avg_order_value|
+---------+------------------+
|Bangalore|36166.666666666664|
|  Chennai|           62000.0|
|   Mumbai|           37000.0|
|    Delhi|           37400.0|
+---------+------------------+



In [67]:
#18
top_products=product_revenue.orderBy(col("total_revenue").desc()).limit(3)
top_products.show()

+-------+-------------+
|product|total_revenue|
+-------+-------------+
| Laptop|     314000.0|
| Vacuum|      88000.0|
| Tablet|      85000.0|
+-------+-------------+



PHASE 5 — WINDOW FUNCTIONS

Tasks

19. Rank cities by total revenue
20. Rank products within each category by revenue
21. Identify the top product per category

In [68]:
#19
city_window=Window.orderBy(col("total_revenue").desc())
city_rank=city_revenue.withColumn("rank",dense_rank().over(city_window))
city_rank.show()

+---------+-------------+----+
|     city|total_revenue|rank|
+---------+-------------+----+
|Bangalore|     217000.0|   1|
|    Delhi|     187000.0|   2|
|   Mumbai|     185000.0|   3|
|  Chennai|      62000.0|   4|
+---------+-------------+----+



In [69]:
#20
cat_prod_window=Window.partitionBy("category").orderBy(col("total_revenue").desc())
product_rank=df_clean.groupBy("category","product").agg(sum("amount").alias("total_revenue")).withColumn("rank",dense_rank().over(cat_prod_window))
product_rank.show()

+-----------+-----------+-------------+----+
|   category|    product|total_revenue|rank|
+-----------+-----------+-------------+----+
|Electronics|     Laptop|     314000.0|   1|
|Electronics|     Tablet|      85000.0|   2|
|Electronics|     Mobile|      65000.0|   3|
|       Home|     Vacuum|      88000.0|   1|
|       Home|Airpurifier|      78000.0|   2|
|       Home|      Mixer|      21000.0|   3|
+-----------+-----------+-------------+----+



In [70]:
#21
top_product_category=product_rank.filter(col("rank")==1)
top_product_category.show()

+-----------+-------+-------------+----+
|   category|product|total_revenue|rank|
+-----------+-------+-------------+----+
|Electronics| Laptop|     314000.0|   1|
|       Home| Vacuum|      88000.0|   1|
+-----------+-------+-------------+----+



PHASE 6 — PERFORMANCE AWARENESS

Tasks

22. Cache the cleaned DataFrame
23. Run multiple aggregations and observe behavior
24. Use explain(True) to inspect shuffle and execution plan
25. Repartition data by city and explain why

In [72]:
#22
df_clean.cache()
df_clean.count()

17

In [73]:
#23
city_revenue.show()
category_revenue.show()
product_revenue.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|    Delhi|     187000.0|
|Bangalore|     217000.0|
|   Mumbai|     185000.0|
|  Chennai|      62000.0|
+---------+-------------+

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|       Home|     187000.0|
|Electronics|     464000.0|
+-----------+-------------+

+-----------+-------------+
|    product|total_revenue|
+-----------+-------------+
|     Vacuum|      88000.0|
|     Laptop|     314000.0|
|     Tablet|      85000.0|
|     Mobile|      65000.0|
|Airpurifier|      78000.0|
|      Mixer|      21000.0|
+-----------+-------------+



In [74]:
#24
df_clean.explain(True)

== Parsed Logical Plan ==
'Filter '`=`('status, Completed)
+- Deduplicate [order_id#1059]
   +- Project [order_id#1059, customer_id#1060, city#1061, category#1062, product#1063, amount#1064, order_date#1065, trim(status#655, None) AS status#1066]
      +- Project [order_id#1059, customer_id#1060, city#1061, category#1062, product#1063, amount#1064, trim(cast(order_date#760 as string), None) AS order_date#1065, status#655]
         +- Project [order_id#1059, customer_id#1060, city#1061, category#1062, product#1063, trim(cast(amount#709 as string), None) AS amount#1064, order_date#760, status#655]
            +- Project [order_id#1059, customer_id#1060, city#1061, category#1062, trim(product#683, None) AS product#1063, amount#709, order_date#760, status#655]
               +- Project [order_id#1059, customer_id#1060, city#1061, trim(category#682, None) AS category#1062, product#683, amount#709, order_date#760, status#655]
                  +- Project [order_id#1059, customer_id#1060, tri

In [75]:
#25
df_partitioned=df_clean.repartition(col("city"))
df_partitioned.explain(True)

== Parsed Logical Plan ==
'RepartitionByExpression ['city]
+- Filter (status#1066 = Completed)
   +- Deduplicate [order_id#1059]
      +- Project [order_id#1059, customer_id#1060, city#1061, category#1062, product#1063, amount#1064, order_date#1065, trim(status#655, None) AS status#1066]
         +- Project [order_id#1059, customer_id#1060, city#1061, category#1062, product#1063, amount#1064, trim(cast(order_date#760 as string), None) AS order_date#1065, status#655]
            +- Project [order_id#1059, customer_id#1060, city#1061, category#1062, product#1063, trim(cast(amount#709 as string), None) AS amount#1064, order_date#760, status#655]
               +- Project [order_id#1059, customer_id#1060, city#1061, category#1062, trim(product#683, None) AS product#1063, amount#709, order_date#760, status#655]
                  +- Project [order_id#1059, customer_id#1060, city#1061, trim(category#682, None) AS category#1062, product#683, amount#709, order_date#760, status#655]
            

PHASE 7 — FILE FORMAT OUTPUT

Tasks

26. Write cleaned order-level data to Parquet
27. Write aggregated analytics to ORC
28. Read both back and validate schema

In [76]:
#26
df_clean.write.mode("overwrite").parquet("/data/clean_orders_parquet")

In [77]:
#27
city_revenue.write.mode("overwrite").orc("/data/city_revenue_orc")

In [78]:
#28
parquet_df=spark.read.parquet("/data/clean_orders_parquet")
orc_df=spark.read.orc("/data/city_revenue_orc")

parquet_df.printSchema()
orc_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)

root
 |-- city: string (nullable = true)
 |-- total_revenue: double (nullable = true)



PHASE 8 — DEBUGGING CHECK

Tasks

29. Explain why this line is incorrect:

df = df.filter(df.amount > 30000).show()

30. Write the corrected version

In [79]:
#29
df = df.filter(df.amount > 30000).show()
#.show() returns None
#Assigning df with .show() overwrites it with None
#therefore pipline breaks


NameError: name 'df' is not defined

In [81]:
#30
df_correct = df_clean.filter(col("amount") > 30000)
df.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD018|       C008|    Delhi|       Home|     Vacuum| 31000|2024-01-14|Completed|
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD017|       C007|Bangalore|Electronics|     Laptop| 51000|2024-01-14|Completed|
|  ORD016|       C006|   Mumbai|       Home|Airpurifier| 40000|2024-01-13|Completed|
|  ORD015|       C005|  Chennai|Electronics|     Laptop| 62000|2024-01-13|Completed|
|  ORD012|       C010|Bangalore|Electronics|     Mobile| 33000|2024-01-11|Completed|
|  ORD020|       C010|Bangalore|Electronics|     Laptop| 54000|2024-01-15|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|20