In [1]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session (an existing one is reused if present);
# later cells read data and run SQL through this `spark` handle.
spark = (
    SparkSession.builder
    .appName("CustomerAnalysis-Pipeline")
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary as the cell output.
spark

**Upload CSV and Load Data**

In [3]:
# Load the customer orders CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/customer_orders.csv")
)
# Print a preview of the loaded rows.
df.show()

+-----------+--------+----------+-------------+----------------------+----------+----------+---------+--------------------+-------------+-------+
|customer_id|order_id|order_date|delivery_date|expected_delivery_date|delay_days|    status|     name|               email|       region|delayed|
+-----------+--------+----------+-------------+----------------------+----------+----------+---------+--------------------+-------------+-------+
|         33|    1001|2025-05-22|   2025-05-25|            2025-05-27|         0| Delivered|   Ishaan|    ishaan@gmail.com|   West-North|  false|
|         35|    1002|2025-05-18|   2025-05-27|            2025-05-23|         4|In Transit|    Kiara|     kiara@gmail.com|   North-East|   true|
|        100|    1003|2025-05-01|   2025-05-05|            2025-05-06|         0| Delivered| Reyanshi|  reyanshi@gmail.com|   North-East|   true|
|         21|    1004|2025-05-24|   2025-05-25|            2025-05-29|         0| Delivered|  Reyansh|   reyansh@gmail.com| 

**Update delivery status**

In [4]:
from pyspark.sql.functions import when, col, current_date
# Derive a `delivery_status` label for every order:
#   "Delayed"   - no delivery date recorded and the expected date is before today
#   "Delivered" - a delivery date is present (it is not compared with the
#                 expected date, so late deliveries are still "Delivered")
#   "Pending"   - everything else (no delivery date, and the expected date is
#                 today, in the future, or missing)
not_yet_delivered = col("delivery_date").isNull()
past_expected_date = col("expected_delivery_date") < current_date()
delivery_status_expr = (
    when(not_yet_delivered & past_expected_date, "Delayed")
    .when(col("delivery_date").isNotNull(), "Delivered")
    .otherwise("Pending")
)

updated_df = df.withColumn("delivery_status", delivery_status_expr)

print("Updated DataFrame:")
updated_df.show()

Updated DataFrame:
+-----------+--------+----------+-------------+----------------------+----------+----------+---------+--------------------+-------------+-------+---------------+
|customer_id|order_id|order_date|delivery_date|expected_delivery_date|delay_days|    status|     name|               email|       region|delayed|delivery_status|
+-----------+--------+----------+-------------+----------------------+----------+----------+---------+--------------------+-------------+-------+---------------+
|         33|    1001|2025-05-22|   2025-05-25|            2025-05-27|         0| Delivered|   Ishaan|    ishaan@gmail.com|   West-North|  false|      Delivered|
|         35|    1002|2025-05-18|   2025-05-27|            2025-05-23|         4|In Transit|    Kiara|     kiara@gmail.com|   North-East|   true|      Delivered|
|        100|    1003|2025-05-01|   2025-05-05|            2025-05-06|         0| Delivered| Reyanshi|  reyanshi@gmail.com|   North-East|   true|      Delivered|
|        

**Save the results as Delta or CSV**

In [5]:
# Destination for the status-annotated orders. Spark's CSV writer treats
# `output_dir` as a directory path, not a single file name.
output_dir = "/content/customer_orders_updated"
# NOTE(review): `output_file` is not used by the write below - confirm whether
# another cell reads it or whether a single-file export was intended.
output_file = "/content/customer_orders_updated.csv"

# Collapse to one partition, then write with a header row, replacing any
# previous output at the same path.
(
    updated_df
    .coalesce(1)
    .write
    .csv(output_dir, mode="overwrite", header=True)
)

**SQL Queries**

In [6]:
# Register the status-annotated orders as a temp view so they can be queried
# with Spark SQL under the name `latest_orders`.
updated_df.createOrReplaceTempView("latest_orders")

# Top 5 customers ranked by how many of their orders have
# delivery_status = 'Delayed'.
# `customer_id ASC` is a secondary sort key: without it, customers tied on
# `delayed_orders` could be ordered (and cut off by LIMIT) differently from
# run to run, making the "top 5" non-reproducible.
top_delayed_customers = spark.sql("""
    SELECT customer_id, COUNT(*) AS delayed_orders
    FROM latest_orders
    WHERE delivery_status = 'Delayed'
    GROUP BY customer_id
    ORDER BY delayed_orders DESC, customer_id ASC
    LIMIT 5
""")
print("Top 5 delayed customers:")
top_delayed_customers.show()

Top 5 delayed customers:
+-----------+--------------+
|customer_id|delayed_orders|
+-----------+--------------+
+-----------+--------------+

