In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

spark=SparkSession.builder.appName("Capstone_Demo").getOrCreate()

#PHASE 1 — DATA INGESTION & SCHEMA MANAGEMENT
Tasks

1. Create schemas explicitly for all datasets
2. Load raw data into DataFrames
3. Handle incorrect data types gracefully
4. Identify corrupt and invalid records

DATASET 1 — SALES TRANSACTIONS (CSV)


In [2]:
sales_data = [
    ("TXN001","Delhi ","Laptop","Electronics","45000","2024-01-05","Completed"),
    ("TXN002","Mumbai","Mobile ","electronics","32000","05/01/2024","Completed"),
    ("TXN003","Bangalore","Tablet"," Electronics ","30000","2024/01/06","Completed"),
    ("TXN004","Delhi","Laptop","Electronics","","2024-01-07","Cancelled"),
    ("TXN005","Chennai","Mobile","Electronics","invalid","2024-01-08","Completed"),
    ("TXN006","Mumbai","Tablet","Electronics",None,"2024-01-08","Completed"),
    ("TXN007","Delhi","Laptop","electronics","45000","09-01-2024","Completed"),
    ("TXN008","Bangalore","Mobile","Electronics","28000","2024-01-09","Completed"),
    ("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),
    ("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed")
]
sales_schema=StructType([
    StructField("txn_id",StringType(),True),
    StructField("city",StringType(),True),
    StructField("product",StringType(),True),
    StructField("category",StringType(),True),
    StructField("amount",StringType(),True),
    StructField("txn_date",StringType(),True),
    StructField("status",StringType(),True)
])
sales_df=spark.createDataFrame(sales_data,sales_schema)
sales_df.show()

+------+---------+-------+-------------+-------+----------+---------+
|txn_id|     city|product|     category| amount|  txn_date|   status|
+------+---------+-------+-------------+-------+----------+---------+
|TXN001|   Delhi | Laptop|  Electronics|  45000|2024-01-05|Completed|
|TXN002|   Mumbai|Mobile |  electronics|  32000|05/01/2024|Completed|
|TXN003|Bangalore| Tablet| Electronics |  30000|2024/01/06|Completed|
|TXN004|    Delhi| Laptop|  Electronics|       |2024-01-07|Cancelled|
|TXN005|  Chennai| Mobile|  Electronics|invalid|2024-01-08|Completed|
|TXN006|   Mumbai| Tablet|  Electronics|   NULL|2024-01-08|Completed|
|TXN007|    Delhi| Laptop|  electronics|  45000|09-01-2024|Completed|
|TXN008|Bangalore| Mobile|  Electronics|  28000|2024-01-09|Completed|
|TXN009|   Mumbai| Laptop|  Electronics|  55000|2024-01-10|Completed|
|TXN009|   Mumbai| Laptop|  Electronics|  55000|2024-01-10|Completed|
+------+---------+-------+-------------+-------+----------+---------+



DATASET 2 — CUSTOMER MASTER

In [3]:
customer_data = [
("C001","Delhi","Premium"),
("C002","Mumbai","Standard"),
("C003","Bangalore","Premium"),
("C004","Chennai","Standard"),
("C005","Mumbai","Premium")
]
customer_schema=StructType([
    StructField("customer_id",StringType(),True),
    StructField("city",StringType(),True),
    StructField("segment",StringType(),True)
])
customer_df=spark.createDataFrame(customer_data,customer_schema)
customer_df.show()

+-----------+---------+--------+
|customer_id|     city| segment|
+-----------+---------+--------+
|       C001|    Delhi| Premium|
|       C002|   Mumbai|Standard|
|       C003|Bangalore| Premium|
|       C004|  Chennai|Standard|
|       C005|   Mumbai| Premium|
+-----------+---------+--------+



DATASET 3 — CITY CLASSIFICATION

In [4]:
city_data = [
("Delhi","Tier-1"),
("Mumbai","Tier-1"),
("Bangalore","Tier-1"),
("Chennai","Tier-2")
]
city_schema=StructType([
    StructField("city",StringType(),True),
    StructField("tier",StringType(),True)
])
city_df=spark.createDataFrame(city_data,city_schema)
city_df.show()

+---------+------+
|     city|  tier|
+---------+------+
|    Delhi|Tier-1|
|   Mumbai|Tier-1|
|Bangalore|Tier-1|
|  Chennai|Tier-2|
+---------+------+



In [5]:
sales_df.filter(col("amount").isNull() | ~col("amount").rlike(r'^[0-9]+$')).show()

+------+-------+-------+-----------+-------+----------+---------+
|txn_id|   city|product|   category| amount|  txn_date|   status|
+------+-------+-------+-----------+-------+----------+---------+
|TXN004|  Delhi| Laptop|Electronics|       |2024-01-07|Cancelled|
|TXN005|Chennai| Mobile|Electronics|invalid|2024-01-08|Completed|
|TXN006| Mumbai| Tablet|Electronics|   NULL|2024-01-08|Completed|
+------+-------+-------+-----------+-------+----------+---------+



PHASE 2 — DATA CLEANING &
TRANSFORMATION

Tasks

5. Trim and normalize string columns
6. Convert category to uppercase
7. Convert amount to integer
8. Handle invalid and null amounts
9. Parse multiple date formats into DateType
10. Remove duplicate transactions
11. Keep only Completed transactions

In [6]:
#5
clean_df=sales_df.withColumn("city",trim(col("city")))\
.withColumn("product",trim(col("product")))\
.withColumn("category",trim(col("category")))\
.withColumn("status",trim(col("status")))

clean_df.show()


+------+---------+-------+-----------+-------+----------+---------+
|txn_id|     city|product|   category| amount|  txn_date|   status|
+------+---------+-------+-----------+-------+----------+---------+
|TXN001|    Delhi| Laptop|Electronics|  45000|2024-01-05|Completed|
|TXN002|   Mumbai| Mobile|electronics|  32000|05/01/2024|Completed|
|TXN003|Bangalore| Tablet|Electronics|  30000|2024/01/06|Completed|
|TXN004|    Delhi| Laptop|Electronics|       |2024-01-07|Cancelled|
|TXN005|  Chennai| Mobile|Electronics|invalid|2024-01-08|Completed|
|TXN006|   Mumbai| Tablet|Electronics|   NULL|2024-01-08|Completed|
|TXN007|    Delhi| Laptop|electronics|  45000|09-01-2024|Completed|
|TXN008|Bangalore| Mobile|Electronics|  28000|2024-01-09|Completed|
|TXN009|   Mumbai| Laptop|Electronics|  55000|2024-01-10|Completed|
|TXN009|   Mumbai| Laptop|Electronics|  55000|2024-01-10|Completed|
+------+---------+-------+-----------+-------+----------+---------+



In [12]:
#6
clean_df=clean_df.withColumn("category",upper(col("category")))
#clean_df.show()

#7
clean_df=clean_df.withColumn("amount_int",when(col("amount").rlike("^[0-9]+$"),col("amount").cast(IntegerType())))
#clean_df.show()

#8
clean_df=clean_df.filter(col("amount_int").isNotNull())
#clean_df.show()

#9
clean_df=clean_df.withColumn("txn_date_parsed",coalesce(
                             try_to_timestamp(col("txn_date"), lit("yyyy-MM-dd")),
                             try_to_timestamp(col("txn_date"), lit("dd-MM-yyyy")),
                             try_to_timestamp(col("txn_date"), lit("MM-dd-yyyy")),
                             try_to_timestamp(col("txn_date"), lit("dd/MM/yyyy")),
                             try_to_timestamp(col("txn_date"), lit("MM/dd/yyyy")),
                             try_to_timestamp(col("txn_date"), lit("yyyy/MM/dd"))
                                                          ).cast(DateType()))
#clean_df.show()

#10
clean_df=clean_df.dropDuplicates()
#clean_df.show()


#11
clean_df=clean_df.filter(col("status").like("Comple%"))
clean_df.show()

+------+---------+-------+-----------+------+----------+---------+----------+---------------+
|txn_id|     city|product|   category|amount|  txn_date|   status|amount_int|txn_date_parsed|
+------+---------+-------+-----------+------+----------+---------+----------+---------------+
|TXN003|Bangalore| Tablet|ELECTRONICS| 30000|2024/01/06|Completed|     30000|     2024-01-06|
|TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|05/01/2024|Completed|     32000|     2024-01-05|
|TXN001|    Delhi| Laptop|ELECTRONICS| 45000|2024-01-05|Completed|     45000|     2024-01-05|
|TXN007|    Delhi| Laptop|ELECTRONICS| 45000|09-01-2024|Completed|     45000|     2024-01-09|
|TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|2024-01-10|Completed|     55000|     2024-01-10|
|TXN008|Bangalore| Mobile|ELECTRONICS| 28000|2024-01-09|Completed|     28000|     2024-01-09|
+------+---------+-------+-----------+------+----------+---------+----------+---------------+

+------+---------+-------+-----------+------+----------+---

PHASE 3 — DATA ENRICHMENT & JOINS

Tasks

12. Join sales data with city lookup
13. Use broadcast join where appropriate
14. Explain join strategy used
15. Enrich sales data with city tier

In [13]:
#12,13
enriched_df=clean_df.join(broadcast(city_df),on="city",how="left")
enriched_df.show()

+---------+------+-------+-----------+------+----------+---------+----------+---------------+------+
|     city|txn_id|product|   category|amount|  txn_date|   status|amount_int|txn_date_parsed|  tier|
+---------+------+-------+-----------+------+----------+---------+----------+---------------+------+
|Bangalore|TXN003| Tablet|ELECTRONICS| 30000|2024/01/06|Completed|     30000|     2024-01-06|Tier-1|
|   Mumbai|TXN002| Mobile|ELECTRONICS| 32000|05/01/2024|Completed|     32000|     2024-01-05|Tier-1|
|    Delhi|TXN001| Laptop|ELECTRONICS| 45000|2024-01-05|Completed|     45000|     2024-01-05|Tier-1|
|    Delhi|TXN007| Laptop|ELECTRONICS| 45000|09-01-2024|Completed|     45000|     2024-01-09|Tier-1|
|   Mumbai|TXN009| Laptop|ELECTRONICS| 55000|2024-01-10|Completed|     55000|     2024-01-10|Tier-1|
|Bangalore|TXN008| Mobile|ELECTRONICS| 28000|2024-01-09|Completed|     28000|     2024-01-09|Tier-1|
+---------+------+-------+-----------+------+----------+---------+----------+--------------

In [14]:
#14
enriched_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [city])
:- Filter status#76 LIKE Comple%
:  +- Deduplicate [city#73, amount_int#261, amount#4, status#76, txn_date_parsed#262, txn_id#0, product#74, txn_date#5, category#260]
:     +- Project [txn_id#0, city#73, product#74, category#260, amount#4, txn_date#5, status#76, amount_int#261, cast(coalesce(try_to_timestamp(txn_date#5, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#5, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#5, Some(MM-dd-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#5, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#5, Some(MM/dd/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#5, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false)) as date) AS txn_date_parsed#262]
:        +- Filter isnotnull(amount_int#261)
:           +- Project [txn_id#0, city#73, product#74, cat

In [15]:
#15
enriched_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- txn_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- amount_int: integer (nullable = true)
 |-- txn_date_parsed: date (nullable = true)
 |-- tier: string (nullable = true)



PHASE 4 — ANALYTICS & WINDOW FUNCTIONS

Tasks

16. Revenue per city
17. Revenue per product
18. Rank cities by total revenue
19. Rank products within each city
20. Identify top-performing city per day

In [20]:
#16
revenue_city=enriched_df.groupBy("city").agg(sum("amount_int").alias("total_revenue"))
revenue_city.show()

#17
revenue_product=enriched_df.groupBy("product").agg(sum("amount_int").alias("total_revenue"))
revenue_product.show()

#18
city_rank_window=Window.orderBy(desc("total_revenue"))
revenue_city=revenue_city.withColumn("city_rank",rank().over(city_rank_window))
revenue_city.show()

#19
product_city_window=Window.partitionBy("city").orderBy(desc("amount_int"))
ranked_products=enriched_df.withColumn("product_rank",rank().over(product_city_window))
ranked_products.show()

#20
daily_city_window=Window.partitionBy("txn_date_parsed").orderBy(desc("amount_int"))
top_city_daily=enriched_df.withColumn("rank",rank().over(daily_city_window)).filter(col("rank")==1)
top_city_daily.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|        58000|
|   Mumbai|        87000|
|    Delhi|        90000|
+---------+-------------+

+-------+-------------+
|product|total_revenue|
+-------+-------------+
| Laptop|       145000|
| Mobile|        60000|
| Tablet|        30000|
+-------+-------------+

+---------+-------------+---------+
|     city|total_revenue|city_rank|
+---------+-------------+---------+
|    Delhi|        90000|        1|
|   Mumbai|        87000|        2|
|Bangalore|        58000|        3|
+---------+-------------+---------+

+---------+------+-------+-----------+------+----------+---------+----------+---------------+------+------------+
|     city|txn_id|product|   category|amount|  txn_date|   status|amount_int|txn_date_parsed|  tier|product_rank|
+---------+------+-------+-----------+------+----------+---------+----------+---------------+------+------------+
|Bangalore|TXN003| Tablet|ELECTRONICS| 30000|2024/01/

PHASE 5 — CACHING, PARTITIONS & OPTIMIZATION

Tasks

21. Identify reusable DataFrames
22. Apply caching appropriately
23. Compare performance with and without cache
24. Repartition data by city
25. Explain why partitioning helps

In [23]:
#21
#Resuable DataFrame is enriched_df

#22,23
enriched_df.cache()
enriched_df.count()

6

In [24]:
enriched_df.groupBy("city").sum("amount_int").show()

+---------+---------------+
|     city|sum(amount_int)|
+---------+---------------+
|Bangalore|          58000|
|    Delhi|          90000|
|   Mumbai|          87000|
+---------+---------------+



In [26]:
#24
partitioned_df=enriched_df.repartition("city")
partitioned_df.rdd.getNumPartitions()

1

PHASE 6 — FILE FORMAT STRATEGY

Tasks

26. Write cleaned data to Parquet
27. Write aggregated data to ORC
28. Compare file structure and size
29. Explain why Avro is not used here
30. Design a future streaming ingestion using Avro

In [27]:
#26
partitioned_df.write.mode("overwrite").partitionBy("city").parquet("/data/clean_sales_parquet")

In [28]:
#27
revenue_city.write.mode("overwrite").orc("/data/agg_revenue_orc")

PHASE 7 — DEBUGGING & ERROR HANDLING

Tasks

31. Identify common mistakes (intentional bugs)
32. Debug schema mismatch errors
33. Debug NoneType DataFrame errors
34. Use explain() to identify inefficiencies

In [29]:
sales_df.explain(True)

== Parsed Logical Plan ==
LogicalRDD [txn_id#0, city#1, product#2, category#3, amount#4, txn_date#5, status#6], false

== Analyzed Logical Plan ==
txn_id: string, city: string, product: string, category: string, amount: string, txn_date: string, status: string
LogicalRDD [txn_id#0, city#1, product#2, category#3, amount#4, txn_date#5, status#6], false

== Optimized Logical Plan ==
LogicalRDD [txn_id#0, city#1, product#2, category#3, amount#4, txn_date#5, status#6], false

== Physical Plan ==
*(1) Scan ExistingRDD[txn_id#0,city#1,product#2,category#3,amount#4,txn_date#5,status#6]



In [30]:
customer_df.explain(True)

== Parsed Logical Plan ==
LogicalRDD [customer_id#29, city#30, segment#31], false

== Analyzed Logical Plan ==
customer_id: string, city: string, segment: string
LogicalRDD [customer_id#29, city#30, segment#31], false

== Optimized Logical Plan ==
LogicalRDD [customer_id#29, city#30, segment#31], false

== Physical Plan ==
*(1) Scan ExistingRDD[customer_id#29,city#30,segment#31]



In [31]:
city_df.explain(True)

== Parsed Logical Plan ==
LogicalRDD [city#42, tier#43], false

== Analyzed Logical Plan ==
city: string, tier: string
LogicalRDD [city#42, tier#43], false

== Optimized Logical Plan ==
LogicalRDD [city#42, tier#43], false

== Physical Plan ==
*(1) Scan ExistingRDD[city#42,tier#43]



PHASE 8 — FINAL VALIDATION &
DELIVERABLES

Tasks

35. Validate record counts
36. Ensure no nulls in critical fields
37. Confirm schema correctness
38. Document optimization decisions

In [32]:
#35
clean_df.count()
enriched_df.count()

6

In [33]:
#36
enriched_df.select([count(when(col(c).isNull(),c)).alias(c)
    for c in ["txn_id","city","amount_int","txn_date_parsed"]
]).show()

+------+----+----------+---------------+
|txn_id|city|amount_int|txn_date_parsed|
+------+----+----------+---------------+
|     0|   0|         0|              0|
+------+----+----------+---------------+



In [34]:
#37
enriched_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- txn_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- amount_int: integer (nullable = true)
 |-- txn_date_parsed: date (nullable = true)
 |-- tier: string (nullable = true)

