DATASET 1 — CUSTOMER MASTER (CORRUPTED)

In [None]:
raw_customers = [
("C001","Rahul","29","Bangalore","Electronics,Fashion"),
("C002","Sneha","Thirty Two","Delhi","Fashion"),
("C003","Aman",None,"Mumbai",["Home","Electronics"]),
("C004","Pallavi","27","Pune","Electronics|Beauty"),

("C005","", "35","Chennai",None)
]

DATASET 2 — SELLER MASTER

In [None]:
raw_sellers = [
("S001","TechWorld","Electronics","2019-06-01"),
("S002","FashionHub","Fashion","01/07/2020"),
("S003","HomeEssentials","Home","2018/09/15"),
("S004","BeautyStore","Beauty","invalid_date")
]

DATASET 3 — PRODUCT CATALOG

In [None]:
raw_products = [
("P001","Laptop","Electronics","S001","55000"),
("P002","Headphones","Electronics","S001","2500"),
("P003","T-Shirt","Fashion","S002","1200"),
("P004","Sofa","Home","S003","45000"),
("P005","Face Cream","Beauty","S004","800")
]

DATASET 4 — ORDERS DATA

In [None]:
raw_orders = [
("O001","C001","P001","2024-01-05","Delivered","55000"),
("O002","C002","P003","05/01/2024","Cancelled","0"),
("O003","C003","P004","2024/01/06","Delivered","45000"),
("O004","C004","P005","invalid_date","Delivered","800"),
("O005","C001","P002","2024-01-10","Delivered","2500"),
("O006","C005","P003","2024-01-12","Delivered","1200")
]

DATASET 5 — CUSTOMER ACTIVITY LOGS

In [None]:
raw_activity = [
("C001","search,view,add_to_cart","{'device':'mobile'}",180),
("C002",["search","view"],"device=laptop",90),
("C003","search|view|purchase",None,120),
("C004",None,"{'device':'tablet'}",60),
("C005","search","{'device':'mobile'}",30)
]

PART A — DATA CLEANING & STRUCTURING

1. Design explicit schemas for all datasets
2. Normalize:

- Age
- Prices
- Dates

3. Convert interests and actions into arrays
4. Handle missing and invalid records gracefully
5. Produce clean DataFrames:

- customers_df
- sellers_df
- products_df
- orders_df
- activity_df

In [None]:
#1

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.appName("E-CommerceAnalytics").getOrCreate()

In [3]:
raw_customers = [
("C001","Rahul","29","Bangalore","Electronics,Fashion"),
("C002","Sneha","Thirty Two","Delhi","Fashion"),
("C003","Aman",None,"Mumbai",["Home","Electronics"]),
("C004","Pallavi","27","Pune","Electronics|Beauty"),

("C005","", "35","Chennai",None)
]
customer_schema = StructType([
StructField("customer_id", StringType(), True),
StructField("name", StringType(), True),
StructField("age", StringType(), True),
StructField("city", StringType(), True),
StructField("interests", StringType(), True)
])
raw_customers_df = spark.createDataFrame(raw_customers, schema=customer_schema)

In [4]:
raw_sellers = [
("S001","TechWorld","Electronics","2019-06-01"),
("S002","FashionHub","Fashion","01/07/2020"),
("S003","HomeEssentials","Home","2018/09/15"),
("S004","BeautyStore","Beauty","invalid_date")
]
seller_schema = StructType([
StructField("seller_id", StringType(), True),
StructField("seller_name", StringType(), True),
StructField("category", StringType(), True),
StructField("start_date", StringType(), True)
])
raw_sellers_df = spark.createDataFrame(raw_sellers, schema=seller_schema)

In [5]:
raw_products = [
("P001","Laptop","Electronics","S001","55000"),
("P002","Headphones","Electronics","S001","2500"),
("P003","T-Shirt","Fashion","S002","1200"),
("P004","Sofa","Home","S003","45000"),
("P005","Face Cream","Beauty","S004","800")
]
product_schema = StructType([
StructField("product_id", StringType(), True),
StructField("product", StringType(), True),
StructField("category", StringType(), True),
StructField("seller_id", StringType(), True),
StructField("start_date", StringType(), True)
])
raw_products_df = spark.createDataFrame(raw_products, schema=product_schema)

In [6]:
raw_orders = [
("O001","C001","P001","2024-01-05","Delivered","55000"),
("O002","C002","P003","05/01/2024","Cancelled","0"),
("O003","C003","P004","2024/01/06","Delivered","45000"),
("O004","C004","P005","invalid_date","Delivered","800"),
("O005","C001","P002","2024-01-10","Delivered","2500"),
("O006","C005","P003","2024-01-12","Delivered","1200")
]
order_schema = StructType([
StructField("order_id", StringType(), True),
StructField("customer_id", StringType(), True),
StructField("product_id", StringType(), True),
StructField("order_date", StringType(), True),
StructField("status", StringType(), True),
StructField("amount", StringType(), True)
])
raw_orders_df = spark.createDataFrame(raw_orders, schema=order_schema)

In [7]:
raw_activity = [
("C001","search,view,add_to_cart","{'device':'mobile'}",180),
("C002",["search","view"],"device=laptop",90),
("C003","search|view|purchase",None,120),
("C004",None,"{'device':'tablet'}",60),
("C005","search","{'device':'mobile'}",30)
]
activity_schema = StructType([
StructField("customer_id", StringType(), True),
StructField("actions", StringType(), True),
StructField("metadata", StringType(), True),
StructField("duration", IntegerType(), True)
])
raw_activity_df = spark.createDataFrame(raw_activity, schema=activity_schema)

In [8]:
#2
from pyspark.sql.functions import regexp_extract, col, when

customers_df = raw_customers_df \
    .withColumn("age", when(regexp_extract("age", "\d+", 0) == "", None)
                .otherwise(regexp_extract("age", "\d+", 0)).cast("int")) \
    .withColumn("name", when(col("name") == "", None).otherwise(col("name")))

customers_df.show()

  .withColumn("age", when(regexp_extract("age", "\d+", 0) == "", None)
  .otherwise(regexp_extract("age", "\d+", 0)).cast("int")) \


+-----------+-------+----+---------+-------------------+
|customer_id|   name| age|     city|          interests|
+-----------+-------+----+---------+-------------------+
|       C001|  Rahul|  29|Bangalore|Electronics,Fashion|
|       C002|  Sneha|NULL|    Delhi|            Fashion|
|       C003|   Aman|NULL|   Mumbai|[Home, Electronics]|
|       C004|Pallavi|  27|     Pune| Electronics|Beauty|
|       C005|   NULL|  35|  Chennai|               NULL|
+-----------+-------+----+---------+-------------------+



In [9]:
products_df = raw_products_df.withColumnRenamed("start_date", "price").withColumn("price", col("price").cast("int"))
products_df.show()

+----------+----------+-----------+---------+-----+
|product_id|   product|   category|seller_id|price|
+----------+----------+-----------+---------+-----+
|      P001|    Laptop|Electronics|     S001|55000|
|      P002|Headphones|Electronics|     S001| 2500|
|      P003|   T-Shirt|    Fashion|     S002| 1200|
|      P004|      Sofa|       Home|     S003|45000|
|      P005|Face Cream|     Beauty|     S004|  800|
+----------+----------+-----------+---------+-----+



In [10]:
orders_df =raw_orders_df.withColumn("order_date",
                                    coalesce(
                                        to_date(try_to_timestamp(col("order_date"),lit("yyyy-MM-dd"))),
                                        to_date(try_to_timestamp(col("order_date"),lit("dd/MM/yyyy"))),
                                        to_date(try_to_timestamp(col("order_date"),lit("yyyy/MM/dd")))
                                    )
                                   )
orders_df.show()

+--------+-----------+----------+----------+---------+------+
|order_id|customer_id|product_id|order_date|   status|amount|
+--------+-----------+----------+----------+---------+------+
|    O001|       C001|      P001|2024-01-05|Delivered| 55000|
|    O002|       C002|      P003|2024-01-05|Cancelled|     0|
|    O003|       C003|      P004|2024-01-06|Delivered| 45000|
|    O004|       C004|      P005|      NULL|Delivered|   800|
|    O005|       C001|      P002|2024-01-10|Delivered|  2500|
|    O006|       C005|      P003|2024-01-12|Delivered|  1200|
+--------+-----------+----------+----------+---------+------+



In [11]:
#3
from pyspark.sql.functions import split, regexp_replace

customers_df = customers_df.withColumn(
    "interests",
    split(regexp_replace("interests", "[|]", ","), ",")
)



from pyspark.sql.functions import split, regexp_replace

activity_df = raw_activity_df.withColumn(
    "actions",
    split(regexp_replace("actions", "[|]", ","), ",")
)

In [12]:
#4
from pyspark.sql.functions import col, to_date, coalesce, split, lit, array_remove, try_to_timestamp

# Make an empty string array: split("", ",") -> [""] then remove "" -> []
empty_string_array = array_remove(split(lit(""), ","), "")

customers_df = customers_df.withColumn(
    "interests",
    coalesce(col("interests"), empty_string_array)
)

orders_df = orders_df.filter(col("order_date").isNotNull())

sellers_df = raw_sellers_df.withColumn(
    "start_date",
    coalesce(
        to_date(try_to_timestamp(col("start_date"), lit("yyyy-MM-dd"))),
        to_date(try_to_timestamp(col("start_date"), lit("dd/MM/yyyy"))),
        to_date(try_to_timestamp(col("start_date"), lit("yyyy/MM/dd")))
    )
)

In [13]:
#5
customers_df.printSchema()
customers_df.show()
sellers_df.printSchema()
sellers_df.show()
products_df.printSchema()
products_df.show()
orders_df.printSchema()
orders_df.show()
activity_df.printSchema()
activity_df.show()

root
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- interests: array (nullable = false)
 |    |-- element: string (containsNull = false)

+-----------+-------+----+---------+--------------------+
|customer_id|   name| age|     city|           interests|
+-----------+-------+----+---------+--------------------+
|       C001|  Rahul|  29|Bangalore|[Electronics, Fas...|
|       C002|  Sneha|NULL|    Delhi|           [Fashion]|
|       C003|   Aman|NULL|   Mumbai|[[Home,  Electron...|
|       C004|Pallavi|  27|     Pune|[Electronics, Bea...|
|       C005|   NULL|  35|  Chennai|                  []|
+-----------+-------+----+---------+--------------------+

root
 |-- seller_id: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- start_date: date (nullable = true)

+---------+--------------+-----------+----------+
|seller_id

PART B — DATA INTEGRATION (JOINS)

6. Join orders with products
7. Join products with sellers
8. Join orders with customers
9. Decide which table(s) should be broadcast
10. Prove your decision using explain(True)
11. Eliminate orphan records

In [14]:
#6
orders_products_df  = orders_df.join(products_df, "product_id", "inner")
orders_products_df.show()

#7
products_seller_df  = products_df.join(broadcast(sellers_df), "seller_id", "inner")
products_seller_df.show()

#8
orders_customers_df  = orders_df.join(customers_df, "customer_id", "inner")
orders_customers_df.show()

+----------+--------+-----------+----------+---------+------+----------+-----------+---------+-----+
|product_id|order_id|customer_id|order_date|   status|amount|   product|   category|seller_id|price|
+----------+--------+-----------+----------+---------+------+----------+-----------+---------+-----+
|      P001|    O001|       C001|2024-01-05|Delivered| 55000|    Laptop|Electronics|     S001|55000|
|      P002|    O005|       C001|2024-01-10|Delivered|  2500|Headphones|Electronics|     S001| 2500|
|      P003|    O002|       C002|2024-01-05|Cancelled|     0|   T-Shirt|    Fashion|     S002| 1200|
|      P003|    O006|       C005|2024-01-12|Delivered|  1200|   T-Shirt|    Fashion|     S002| 1200|
|      P004|    O003|       C003|2024-01-06|Delivered| 45000|      Sofa|       Home|     S003|45000|
+----------+--------+-----------+----------+---------+------+----------+-----------+---------+-----+

+---------+----------+----------+-----------+-----+--------------+-----------+----------+


In [15]:
#9
from pyspark.sql.functions import broadcast

# Join orders_df with broadcasted customers_df
orders_customers_broadcast_df = orders_df.join(broadcast(customers_df), "customer_id", "inner")

print("Physical plan for join with broadcasted customers_df:")
orders_customers_broadcast_df.explain(True)

Physical plan for join with broadcasted customers_df:
== Parsed Logical Plan ==
'Join UsingJoin(Inner, [customer_id])
:- Filter isnotnull(order_date#60)
:  +- Project [order_id#14, customer_id#15, product_id#16, coalesce(to_date(try_to_timestamp(order_date#17, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(order_date#17, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(order_date#17, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true)) AS order_date#60, status#18, amount#19]
:     +- LogicalRDD [order_id#14, customer_id#15, product_id#16, order_date#17, status#18, amount#19], false
+- ResolvedHint (strategy=broadcast)
   +- Project [customer_id#0, name#25, age#24, city#3, coalesce(interests#80, array_remove(split(, ,, -1), )) AS interests#82]
      +- Project [customer_id#0, name#25, age#24, city#3, split(regexp_replace(interests#4, [|

In [16]:
#10
from pyspark.sql.functions import broadcast

# Join orders_df with broadcasted customers_df
orders_customers_broadcast_df = orders_df.join(broadcast(customers_df), "customer_id", "inner")

print("Physical plan for join with broadcasted customers_df:")
orders_customers_broadcast_df.explain(True)

Physical plan for join with broadcasted customers_df:
== Parsed Logical Plan ==
'Join UsingJoin(Inner, [customer_id])
:- Filter isnotnull(order_date#60)
:  +- Project [order_id#14, customer_id#15, product_id#16, coalesce(to_date(try_to_timestamp(order_date#17, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(order_date#17, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(order_date#17, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true)) AS order_date#60, status#18, amount#19]
:     +- LogicalRDD [order_id#14, customer_id#15, product_id#16, order_date#17, status#18, amount#19], false
+- ResolvedHint (strategy=broadcast)
   +- Project [customer_id#0, name#25, age#24, city#3, coalesce(interests#80, array_remove(split(, ,, -1), )) AS interests#82]
      +- Project [customer_id#0, name#25, age#24, city#3, split(regexp_replace(interests#4, [|

In [17]:
#11
# 1. Eliminate customers without any orders
customers_with_orders_df = customers_df.join(orders_df, "customer_id", "left_semi")
orphan_customers_df = customers_df.join(orders_df, "customer_id", "left_anti")

print("Orphan Customers:")
orphan_customers_df.show()

# Update customers_df to only include customers with orders
customers_df = customers_with_orders_df

# 2. Eliminate products without any orders
products_with_orders_df = products_df.join(orders_df, "product_id", "left_semi")
orphan_products_df = products_df.join(orders_df, "product_id", "left_anti")

print("Orphan Products:")
orphan_products_df.show()

# Update products_df to only include products with orders
products_df = products_with_orders_df

# 3. Eliminate sellers without any products
sellers_with_products_df = sellers_df.join(products_df, "seller_id", "left_semi")
orphan_sellers_df = sellers_df.join(products_df, "seller_id", "left_anti")

print("Orphan Sellers:")
orphan_sellers_df.show()

# Update sellers_df to only include sellers with products
sellers_df = sellers_with_products_df

print("DataFrames after eliminating orphan records:")
customers_df.show()
products_df.show()
sellers_df.show()

Orphan Customers:
+-----------+-------+---+----+--------------------+
|customer_id|   name|age|city|           interests|
+-----------+-------+---+----+--------------------+
|       C004|Pallavi| 27|Pune|[Electronics, Bea...|
+-----------+-------+---+----+--------------------+

Orphan Products:
+----------+----------+--------+---------+-----+
|product_id|   product|category|seller_id|price|
+----------+----------+--------+---------+-----+
|      P005|Face Cream|  Beauty|     S004|  800|
+----------+----------+--------+---------+-----+

Orphan Sellers:
+---------+-----------+--------+----------+
|seller_id|seller_name|category|start_date|
+---------+-----------+--------+----------+
|     S004|BeautyStore|  Beauty|      NULL|
+---------+-----------+--------+----------+

DataFrames after eliminating orphan records:
+-----------+-----+----+---------+--------------------+
|customer_id| name| age|     city|           interests|
+-----------+-----+----+---------+--------------------+
|       

PART C — ANALYTICS & AGGREGATIONS

12. Total revenue per category
13. Total revenue per seller
14. Total orders per customer
15. Average order value per customer
16. Identify sellers with zero delivered orders

In [18]:
#12
revenue_category_df = orders_products_df.groupBy("category").agg(sum("amount").alias("total_revenue"))
revenue_category_df.show()

#13
revenue_seller_df = orders_products_df.groupBy("seller_id").agg(sum("amount").alias("total_revenue"))
revenue_seller_df.show()

#14
orders_customers_df = orders_df.groupBy("customer_id").agg(count("order_id").alias("total_orders"))
orders_customers_df.show()

#15
average_order_value_df = orders_df.withColumn("amount", col("amount").cast("double")) \
    .groupBy("customer_id") \
    .agg(avg("amount").alias("average_order_value"))
average_order_value_df.show()

#16
average_order_value_df = orders_df.withColumn("amount", col("amount").cast("double")) \
    .groupBy("customer_id") \
    .agg(avg("amount").alias("average_order_value"))
average_order_value_df.show()

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|       Home|      45000.0|
|    Fashion|       1200.0|
|Electronics|      57500.0|
+-----------+-------------+

+---------+-------------+
|seller_id|total_revenue|
+---------+-------------+
|     S001|      57500.0|
|     S002|       1200.0|
|     S003|      45000.0|
+---------+-------------+

+-----------+------------+
|customer_id|total_orders|
+-----------+------------+
|       C003|           1|
|       C001|           2|
|       C002|           1|
|       C005|           1|
+-----------+------------+

+-----------+-------------------+
|customer_id|average_order_value|
+-----------+-------------------+
|       C003|            45000.0|
|       C001|            28750.0|
|       C002|                0.0|
|       C005|             1200.0|
+-----------+-------------------+

+-----------+-------------------+
|customer_id|average_order_value|
+-----------+-------------------+
|       C003|            450

PART D — WINDOW FUNCTIONS

17. Rank customers by total spend (overall)
18. Rank sellers by revenue within each category
19. Calculate running revenue per day
20. Identify top 2 products per category by revenue

In [19]:
#17
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, col, rank

# Calculate total spend per customer
total_spend_per_customer_df = orders_products_df.groupBy("customer_id") \
    .agg(sum(col("amount").cast("double")).alias("total_spend"))

# Define a window specification to rank customers by total spend
window_spec = Window.orderBy(col("total_spend").desc())

# Apply the rank function
customer_spend_rank_df = total_spend_per_customer_df.withColumn("spend_rank", rank().over(window_spec))

customer_spend_rank_df.show()

+-----------+-----------+----------+
|customer_id|total_spend|spend_rank|
+-----------+-----------+----------+
|       C001|    57500.0|         1|
|       C003|    45000.0|         2|
|       C005|     1200.0|         3|
|       C002|        0.0|         4|
+-----------+-----------+----------+



In [20]:
#18
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, col, rank

revenue_per_seller_category_df = orders_products_df.groupBy("category", "seller_id") \
    .agg(sum(col("amount").cast("double")).alias("total_revenue"))

window_spec_category = Window.partitionBy("category").orderBy(col("total_revenue").desc())

seller_category_rank_df = revenue_per_seller_category_df.withColumn("category_rank", rank().over(window_spec_category))

seller_category_rank_df.show()

+-----------+---------+-------------+-------------+
|   category|seller_id|total_revenue|category_rank|
+-----------+---------+-------------+-------------+
|Electronics|     S001|      57500.0|            1|
|    Fashion|     S002|       1200.0|            1|
|       Home|     S003|      45000.0|            1|
+-----------+---------+-------------+-------------+



In [21]:
#19
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, col, asc

daily_revenue_df = orders_products_df.withColumn("amount", col("amount").cast("double")) \
    .groupBy("order_date") \
    .agg(sum("amount").alias("daily_revenue"))


window_spec_daily = Window.orderBy(asc("order_date"))

running_revenue_df = daily_revenue_df.withColumn("running_revenue", sum("daily_revenue").over(window_spec_daily))

running_revenue_df.show()

+----------+-------------+---------------+
|order_date|daily_revenue|running_revenue|
+----------+-------------+---------------+
|2024-01-05|      55000.0|        55000.0|
|2024-01-06|      45000.0|       100000.0|
|2024-01-10|       2500.0|       102500.0|
|2024-01-12|       1200.0|       103700.0|
+----------+-------------+---------------+



In [22]:
#20
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, col, rank

product_revenue_per_category_df = orders_products_df.groupBy("category", "product_id", "product") \
    .agg(sum(col("amount").cast("double")).alias("product_revenue"))

window_spec_product_rank = Window.partitionBy("category").orderBy(col("product_revenue").desc())

top_2_products_per_category_df = product_revenue_per_category_df.withColumn("rank", rank().over(window_spec_product_rank)) \
    .filter(col("rank") <= 2)

top_2_products_per_category_df.show()

+-----------+----------+----------+---------------+----+
|   category|product_id|   product|product_revenue|rank|
+-----------+----------+----------+---------------+----+
|Electronics|      P001|    Laptop|        55000.0|   1|
|Electronics|      P002|Headphones|         2500.0|   2|
|    Fashion|      P003|   T-Shirt|         1200.0|   1|
|       Home|      P004|      Sofa|        45000.0|   1|
+-----------+----------+----------+---------------+----+



PART E — UDF (ONLY IF REQUIRED)

21. Classify customers into spending tiers:

- High
- Medium
- Low

Rules:

- Prefer built-in functions
- Use UDF only if unavoidable
- Justify your choice

In [23]:
from pyspark.sql.functions import col, when
customer_spending_tiers_df = total_spend_per_customer_df.withColumn(
    "spending_tier",
    when(col("total_spend") > 10000, "High")
    .when((col("total_spend") > 1000) & (col("total_spend") <= 10000), "Medium")
    .otherwise("Low")
)

print("Customers classified into spending tiers:")
customer_spending_tiers_df.show()

# Justification for not using a UDF:
# PySpark's `when().otherwise()` provides native, optimized functionality for conditional logic.
# It is executed within the Spark engine, benefiting from Catalyst Optimizer and Tungsten execution engine,
# leading to significantly better performance compared to Python UDFs. UDFs involve serialization/deserialization
# overhead and context switching between JVM and Python, which can be very slow for large datasets.
# Since `when().otherwise()` perfectly handles the tier classification logic, a UDF is unnecessary and less efficient

Customers classified into spending tiers:
+-----------+-----------+-------------+
|customer_id|total_spend|spending_tier|
+-----------+-----------+-------------+
|       C003|    45000.0|         High|
|       C005|     1200.0|       Medium|
|       C001|    57500.0|         High|
|       C002|        0.0|          Low|
+-----------+-----------+-------------+



PART F — SORTING & ORDERING

22. Sort categories by total revenue (descending)
23. Sort sellers by revenue within category
24. Explain why sorting caused a shuffle

In [24]:
#22
from pyspark.sql.functions import desc

sorted_categories_by_revenue_df = revenue_category_df.orderBy(desc("total_revenue"))
sorted_categories_by_revenue_df.show()

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|Electronics|      57500.0|
|       Home|      45000.0|
|    Fashion|       1200.0|
+-----------+-------------+



In [25]:
#23
from pyspark.sql.functions import col
sorted_sellers_by_category_revenue_df = seller_category_rank_df.orderBy(col("category").asc(), col("total_revenue").desc())

print("Sellers sorted by revenue within each category:")
sorted_sellers_by_category_revenue_df.show()

Sellers sorted by revenue within each category:
+-----------+---------+-------------+-------------+
|   category|seller_id|total_revenue|category_rank|
+-----------+---------+-------------+-------------+
|Electronics|     S001|      57500.0|            1|
|    Fashion|     S002|       1200.0|            1|
|       Home|     S003|      45000.0|            1|
+-----------+---------+-------------+-------------+



In [None]:
# Sorting a DataFrame in Spark often triggers a 'shuffle' operation.
# A shuffle is the process of redistributing data across partitions (and potentially across machines in a cluster).
# This is necessary because to perform a global sort (or even a sort within groups if data is not pre-partitioned
# or pre-sorted), all data relevant to a specific sort key range might need to be collected on the same partition.
# For example, when sorting categories by total revenue, Spark needs to know the total revenue for all categories
# to correctly order them. If different parts of a category's data reside on different partitions,
# Spark must move this data to ensure a consistent global order. This involves serializing data,
# sending it over the network, and deserializing it on the receiving end, which is a resource-intensive operation.

PART G — SET OPERATIONS

Create two DataFrames:
- Customers who placed orders
- Customers who were active (search/view)
25. Find customers who were active but never ordered
26. Find customers who ordered and were active
27. Explain why set operations differ from joins

In [26]:
ordered_customers_df = orders_df.select("customer_id").distinct()
active_customers_df = activity_df.filter(col("actions").isNotNull()).select("customer_id").distinct()

In [27]:
#25
active_customers_df.subtract(ordered_customers_df).show()

+-----------+
|customer_id|
+-----------+
+-----------+



In [28]:
#26
active_customers_df.intersect(ordered_customers_df).show()

+-----------+
|customer_id|
+-----------+
|       C003|
|       C005|
|       C001|
|       C002|
+-----------+



In [29]:
#27
# Differences between Set Operations and Joins:
#
# Set Operations (UNION, INTERSECT, EXCEPT/SUBTRACT):
# - Operate on the *rows* of DataFrames.
# - Require the DataFrames to have a compatible schema (same number of columns, same column names, and compatible data types).
# - Combine or compare rows based on their *entire content*.
# - The result has the same schema as the input DataFrames.
#
# Join Operations (INNER, LEFT, RIGHT, FULL, ANTI, SEMI):
# - Combine *columns* from two DataFrames.
# - Combine data based on a *common key* or a specified condition.
# - Typically result in a wider DataFrame (more columns) by merging information from both DataFrames.
# - The schema of the result is a combination of the schemas of the input DataFrames (excluding duplicate join keys if specified).

print("\n--- Set Operations (Operating on rows) ---")
print("Customers active but never ordered (using subtract):")
active_customers_df.subtract(ordered_customers_df).show()

print("Customers who ordered AND were active (using intersect):")
active_customers_df.intersect(ordered_customers_df).show()

print("\n--- Join Operations (Operating on columns based on keys) ---")
print("Inner Join: Combining customer and order details for matching customer_ids:")
orders_df.join(customers_df, "customer_id", "inner").show()

print("Left Anti Join: Customers from 'customers_df' who are NOT in 'orders_df' (different from subtract, key-based):")
customers_df.join(orders_df, "customer_id", "left_anti").show()


--- Set Operations (Operating on rows) ---
Customers active but never ordered (using subtract):
+-----------+
|customer_id|
+-----------+
+-----------+

Customers who ordered AND were active (using intersect):
+-----------+
|customer_id|
+-----------+
|       C003|
|       C005|
|       C001|
|       C002|
+-----------+


--- Join Operations (Operating on columns based on keys) ---
Inner Join: Combining customer and order details for matching customer_ids:
+-----------+--------+----------+----------+---------+------+-----+----+---------+--------------------+
|customer_id|order_id|product_id|order_date|   status|amount| name| age|     city|           interests|
+-----------+--------+----------+----------+---------+------+-----+----+---------+--------------------+
|       C003|    O003|      P004|2024-01-06|Delivered| 45000| Aman|NULL|   Mumbai|[[Home,  Electron...|
|       C001|    O001|      P001|2024-01-05|Delivered| 55000|Rahul|  29|Bangalore|[Electronics, Fas...|
|       C002|    O

PART H — DAG & PERFORMANCE ANALYSIS

28. Run explain(True) for:
- Product → Seller join
- Window ranking
- Sorting
29. Identify:
- Shuffles
- Broadcast joins
Sort stages
30. Suggest one performance improvement

In [30]:
#28
products_seller_df.explain(True)
customer_spend_rank_df.explain(True)
sorted_categories_by_revenue_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [seller_id])
:- Project [product_id#9, product#10, category#11, seller_id#12, cast(price#42 as int) AS price#43]
:  +- Project [product_id#9, product#10, category#11, seller_id#12, start_date#13 AS price#42]
:     +- LogicalRDD [product_id#9, product#10, category#11, seller_id#12, start_date#13], false
+- ResolvedHint (strategy=broadcast)
   +- Project [seller_id#5, seller_name#6, category#7, coalesce(to_date(try_to_timestamp(start_date#8, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(start_date#8, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(start_date#8, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true)) AS start_date#83]
      +- LogicalRDD [seller_id#5, seller_name#6, category#7, start_date#8], false

== Analyzed Logical Plan ==
seller_id: string, product_id: string, product: 

In [31]:
#29
#Shuffles: GroupBy, sort
#Broadcast joins: Seller join
#Sort stages: Window+OrderBy

In [33]:
#30
# Performance Improvement Suggestion:

orders_products_df.cache()
# Cache the 'orders_products_df' DataFrame.
# This DataFrame is the result of a join and is used multiple times in subsequent calculations
# (e.g., total revenue per category/seller, running revenue, top products).
# Caching it will prevent Spark from recomputing this DataFrame every time it's accessed.

DataFrame[product_id: string, order_id: string, customer_id: string, order_date: date, status: string, amount: string, product: string, category: string, seller_id: string, price: int]