**Initialize the Spark Session**

In [None]:
from pyspark.sql import SparkSession

# Name the application first, then obtain the session; getOrCreate() hands
# back an existing session if one is already running.
session_builder = SparkSession.builder.appName("Customer-order-analysis")
spark = session_builder.getOrCreate()

**Load the datasets**

In [None]:
# Both files are read the same way: first row is the header and Spark
# infers the column types from the data.
csv_options = {"header": True, "inferSchema": True}

customer = spark.read.csv("/content/customers.csv", **csv_options)
customer.show()

order = spark.read.csv("/content/orders.csv", **csv_options)
order.show()

+-----------+---------+--------------------+-------------+
|customer_id|     name|               email|       region|
+-----------+---------+--------------------+-------------+
|          1|    Aaron|     aaron@gmail.com|         East|
|          2|      Sam|       sam@gmail.com|         East|
|          3|    Choki|     choki@gmail.com|         East|
|          4|   Mickey|    mickey@gmail.com|         West|
|          5|    Tomie|     tomie@gmail.com|        North|
|          6|     dudu|      dudu@gmail.com|        South|
|          7|    samie|     samie@gmail.com|         West|
|          8|     bubu|      bubu@gmail.com|         West|
|          9|samyuktha|samyukatha@gmail.com|         East|
|         10|     love|      love@gmail.com|         East|
|         11|    Kiara|     kiara@gmail.com|South-Central|
|         12|      Sai|       sai@gmail.com|   North-East|
|         13|  Shaurya|   shaurya@gmail.com|South-Central|
|         14|      Ira|       ira@gmail.com|    East-Wes

**Data Preprocessing**

In [None]:
from pyspark.sql.functions import col, to_date

# Orders: turn order_date and delivery_date into date columns and cast the
# join key customer_id to int.
orders_df = (
    order
    .withColumn("order_date", to_date(col("order_date")))
    .withColumn("delivery_date", to_date(col("delivery_date")))
    .withColumn("customer_id", col("customer_id").cast("int"))
)

# Customers: cast customer_id to int as well so both sides of the join
# use the same key type.
customers_df = customer.withColumn(
    "customer_id",
    col("customer_id").cast("int"),
)

**Join customers and orders**

In [None]:
# Inner join on customer_id: only orders with a matching customer row are kept.
joined_df = orders_df.join(customers_df, "customer_id", "inner")
joined_df.show()

+-----------+--------+----------+-------------+----------------------+----------+----------+---------+--------------------+-------------+
|customer_id|order_id|order_date|delivery_date|expected_delivery_date|delay_days|    status|     name|               email|       region|
+-----------+--------+----------+-------------+----------------------+----------+----------+---------+--------------------+-------------+
|         33|    1001|2025-05-22|   2025-05-25|            2025-05-27|         0| Delivered|   Ishaan|    ishaan@gmail.com|   West-North|
|         35|    1002|2025-05-18|   2025-05-27|            2025-05-23|         4|In Transit|    Kiara|     kiara@gmail.com|   North-East|
|        100|    1003|2025-05-01|   2025-05-05|            2025-05-06|         0| Delivered| Reyanshi|  reyanshi@gmail.com|   North-East|
|         21|    1004|2025-05-24|   2025-05-25|            2025-05-29|         0| Delivered|  Reyansh|   reyansh@gmail.com| Central-East|
|        100|    1005|2025-05-07| 

**Save the joined data**

In [None]:
# Write the joined data as CSV with a header row; coalesce(1) collapses the
# frame to one partition so a single part file is produced.
# mode("overwrite") replaces output left by a previous run. Without it the
# writer uses its default save mode, which raises an error when the target
# directory already exists, so re-running this cell would fail.
(
    joined_df.coalesce(1)
    .write.mode("overwrite")
    .option("header", True)
    .csv("/content/customer_orders")
)

In [None]:
from pyspark.sql.functions import datediff

# Flag an order as delayed when it was delivered more than 3 days after
# it was placed.
# NOTE(review): this rule ignores the dataset's own delay_days and
# expected_delivery_date columns, so the two can disagree — confirm the
# 3-day rule is the intended definition.
days_to_deliver = datediff(col("delivery_date"), col("order_date"))
joined_df = joined_df.withColumn("delayed", days_to_deliver > 3)
joined_df.show()

+-----------+--------+----------+-------------+----------------------+----------+----------+---------+--------------------+-------------+-------+
|customer_id|order_id|order_date|delivery_date|expected_delivery_date|delay_days|    status|     name|               email|       region|delayed|
+-----------+--------+----------+-------------+----------------------+----------+----------+---------+--------------------+-------------+-------+
|         33|    1001|2025-05-22|   2025-05-25|            2025-05-27|         0| Delivered|   Ishaan|    ishaan@gmail.com|   West-North|  false|
|         35|    1002|2025-05-18|   2025-05-27|            2025-05-23|         4|In Transit|    Kiara|     kiara@gmail.com|   North-East|   true|
|        100|    1003|2025-05-01|   2025-05-05|            2025-05-06|         0| Delivered| Reyanshi|  reyanshi@gmail.com|   North-East|   true|
|         21|    1004|2025-05-24|   2025-05-25|            2025-05-29|         0| Delivered|  Reyansh|   reyansh@gmail.com| 

**Group by region and count delays**

In [None]:
from pyspark.sql import functions as F

# Casting the boolean flag to int lets a plain sum count the delayed
# orders in each region.
delayed_as_int = F.col("delayed").cast("int")
delay_count = F.sum(delayed_as_int).alias("delay_count")

result_df = joined_df.groupBy("region").agg(delay_count)
result_df.show()


+-------------+-----------+
|       region|delay_count|
+-------------+-----------+
|    East-West|         13|
|   North-East|          9|
|        South|          1|
|South-Central|         11|
|   West-North|         12|
|         East|          9|
|         West|          0|
| Central-East|         11|
+-------------+-----------+



**Save result to CSV**

In [None]:
# Write the per-region delay counts as CSV with a header row; coalesce(1)
# collapses the frame to one partition so a single part file is produced.
# mode("overwrite") replaces output left by a previous run. Without it the
# writer uses its default save mode, which raises an error when the target
# directory already exists, so re-running this cell would fail.
(
    result_df.coalesce(1)
    .write.mode("overwrite")
    .option("header", True)
    .csv("/content/delay_by_region")
)