In [1]:
# Obtain the Spark session used by every later cell, registered under the
# application name "Customers".
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Customers")
    .getOrCreate()
)

In [4]:
# Read the orders and delivery-status CSV files into Spark DataFrames.
# Both reads treat the first row as the header and let Spark infer column types.

order = spark.read.options(header=True, inferSchema=True).csv("orders.csv")
delivery = spark.read.options(header=True, inferSchema=True).csv("delivery_status.csv")

# Print a preview of each frame to check the load.
order.show()
delivery.show()

+--------+-----------+----------+------------+
|order_id|customer_id|order_date|total_amount|
+--------+-----------+----------+------------+
|       1|          1|01-08-2025|        2500|
|       2|          2|05-08-2025|        1800|
|       3|          3|02-08-2025|        3200|
|       4|          4|10-08-2025|        1450|
|       5|          5|15-08-2025|        2200|
|       6|          6|07-08-2025|        2750|
|       7|          7|18-08-2025|         900|
|       8|          8|12-08-2025|        1100|
+--------+-----------+----------+------------+

+-----------+--------+---------------+----------+
|delivery_id|order_id|delivery_status|updated_on|
+-----------+--------+---------------+----------+
|          1|       1|      DELIVERED|05-08-2025|
|          2|       2|     IN_TRANSIT|06-08-2025|
|          3|       3|      DELIVERED|03-08-2025|
|          4|       4|     IN_TRANSIT|11-08-2025|
|          5|       5|         PLACED|15-08-2025|
|          6|       6|      DELIVER

In [5]:
# Read the customer master data (header row present, column types inferred).
customer = spark.read.options(header=True, inferSchema=True).csv("Customers.csv")
customer.show()

+-----------+-----------------+--------------------+------+
|customer_id|             name|               email|region|
+-----------+-----------------+--------------------+------+
|          1|    Claire Dunphy|  claire@example.com|  East|
|          2|      Phil Dunphy|realtorphil@examp...|  East|
|          3|    Jay Prichette|     jay@example.com| North|
|          4| Gloria Prichette|  gloria@example.com| North|
|          5|Mitchel Prichette|lawyermitch@examp...| South|
|          6|   Cameron Tucker|     cam@example.com| South|
|          7|     Haley Dunphy|   haley@example.com|  West|
|          8|   Dylan Marshall|   dylan@example.com|  West|
+-----------+-----------------+--------------------+------+



In [8]:
# Enrich each order with its customer's details: inner join on customer_id,
# so only orders with a matching customer row are kept.
or_custom = order.join(customer, ["customer_id"], "inner")
or_custom.show()

+-----------+--------+----------+------------+-----------------+--------------------+------+
|customer_id|order_id|order_date|total_amount|             name|               email|region|
+-----------+--------+----------+------------+-----------------+--------------------+------+
|          1|       1|01-08-2025|        2500|    Claire Dunphy|  claire@example.com|  East|
|          2|       2|05-08-2025|        1800|      Phil Dunphy|realtorphil@examp...|  East|
|          3|       3|02-08-2025|        3200|    Jay Prichette|     jay@example.com| North|
|          4|       4|10-08-2025|        1450| Gloria Prichette|  gloria@example.com| North|
|          5|       5|15-08-2025|        2200|Mitchel Prichette|lawyermitch@examp...| South|
|          6|       6|07-08-2025|        2750|   Cameron Tucker|     cam@example.com| South|
|          7|       7|18-08-2025|         900|     Haley Dunphy|   haley@example.com|  West|
|          8|       8|12-08-2025|        1100|   Dylan Marshall|   dyl

In [9]:
# Attach the delivery record to every order+customer row (inner join on
# order_id) so that region and delivery_status sit in the same frame for the
# region-wise delay analysis.
full = or_custom.join(delivery, ["order_id"], "inner")
full.show()

+--------+-----------+----------+------------+-----------------+--------------------+------+-----------+---------------+----------+
|order_id|customer_id|order_date|total_amount|             name|               email|region|delivery_id|delivery_status|updated_on|
+--------+-----------+----------+------------+-----------------+--------------------+------+-----------+---------------+----------+
|       1|          1|01-08-2025|        2500|    Claire Dunphy|  claire@example.com|  East|          1|      DELIVERED|05-08-2025|
|       2|          2|05-08-2025|        1800|      Phil Dunphy|realtorphil@examp...|  East|          2|     IN_TRANSIT|06-08-2025|
|       3|          3|02-08-2025|        3200|    Jay Prichette|     jay@example.com| North|          3|      DELIVERED|03-08-2025|
|       4|          4|10-08-2025|        1450| Gloria Prichette|  gloria@example.com| North|          4|     IN_TRANSIT|11-08-2025|
|       5|          5|15-08-2025|        2200|Mitchel Prichette|lawyermitch@

In [10]:
from pyspark.sql.functions import col, count

# Keep only the orders whose delivery_status is something other than
# "DELIVERED" (e.g. IN_TRANSIT, PLACED); these are treated as delayed.
# A NULL status compares as NULL rather than true, so such rows are not kept.
delay = full.where(col("delivery_status") != "DELIVERED")
delay.show()

+--------+-----------+----------+------------+-----------------+--------------------+------+-----------+---------------+----------+
|order_id|customer_id|order_date|total_amount|             name|               email|region|delivery_id|delivery_status|updated_on|
+--------+-----------+----------+------------+-----------------+--------------------+------+-----------+---------------+----------+
|       2|          2|05-08-2025|        1800|      Phil Dunphy|realtorphil@examp...|  East|          2|     IN_TRANSIT|06-08-2025|
|       4|          4|10-08-2025|        1450| Gloria Prichette|  gloria@example.com| North|          4|     IN_TRANSIT|11-08-2025|
|       5|          5|15-08-2025|        2200|Mitchel Prichette|lawyermitch@examp...| South|          5|         PLACED|15-08-2025|
|       7|          7|18-08-2025|         900|     Haley Dunphy|   haley@example.com|  West|          7|         PLACED|18-08-2025|
|       8|          8|12-08-2025|        1100|   Dylan Marshall|   dylan@exa

In [11]:
# Count the not-yet-delivered orders per region; the count column is exposed
# as "orders_delay".
delay_region = (
    delay
    .groupBy("region")
    .agg(count("*").alias("orders_delay"))
)
delay_region.show()

+------+------------+
|region|orders_delay|
+------+------------+
| South|           1|
|  East|           1|
|  West|           2|
| North|           1|
+------+------------+



In [12]:
# Persist the per-region delay counts and the order+customer join as CSV with
# a header row. Spark writes each target path as a directory of part files.
#
# mode("overwrite") replaces output left by a previous run. The writer's
# default save mode raises an error when the target path already exists,
# which made this cell fail on every re-run of the notebook.
delay_region.write.mode("overwrite").csv('delay_region.csv', header=True)

or_custom.write.mode("overwrite").csv("joined.csv", header=True)