In [None]:
!pip install pyspark==3.5.1



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, datediff

# Supply-chain delay analysis.
#
# Loads the processed supply-chain report, derives per-order delay columns,
# counts delayed orders per supplier, and writes both results out as CSV.

# Initialize Spark session
spark = (
    SparkSession.builder
    .appName("SupplyChainDelayAnalysis")
    .getOrCreate()
)

# Everything that uses the session runs inside try/finally so the session is
# stopped even when loading, transforming, or writing fails part-way through.
try:
    # Load the CSV file
    df = spark.read.csv(
        "/content/processed_supply_chain_report.csv",
        header=True,
        inferSchema=True,
    )

    # Convert dates to proper format if not already
    df = (
        df.withColumn("orderdate", col("orderdate").cast("date"))
        .withColumn("deliverydate", col("deliverydate").cast("date"))
    )

    # Calculate delay in days (deliverydate minus orderdate).
    # NOTE(review): this measures the time from order to delivery, so any
    # order delivered after the day it was placed is counted as delayed.
    # Confirm whether an expected/promised delivery date should be the
    # baseline instead.
    df = df.withColumn(
        "delay_days", datediff(col("deliverydate"), col("orderdate"))
    )

    # Create a new column to indicate delay status (1 = delayed, 0 = not).
    # Rows whose delay_days is null fall through to otherwise(0).
    df = df.withColumn(
        "is_delayed", when(col("delay_days") > 0, 1).otherwise(0)
    )

    # Filter delayed shipments. Reuses the flag computed above so the
    # definition of "delayed" lives in one place; selects the same rows as
    # filtering on delay_days > 0.
    delayed_df = df.filter(col("is_delayed") == 1)

    # Show some delayed shipment results
    print("Delayed Shipments:")
    delayed_df.select(
        "orderid",
        "itemid",
        "orderdate",
        "deliverydate",
        "delay_days",
        "is_delayed",
    ).show(10)

    # Task 3: Group by supplierid and suppliername and count delayed orders
    delayed_count_df = delayed_df.groupBy("supplierid", "suppliername").count()

    print("Delayed orders count by supplier:")
    delayed_count_df.show()

    # Save the processed data with delay info. coalesce(1) collapses the
    # data to one partition before writing; the target path is an output
    # directory, overwritten if it already exists.
    df.coalesce(1).write.csv(
        "processed_supply_chain_report_with_delays",
        header=True,
        mode="overwrite",
    )

    # Save the aggregated delayed orders count
    delayed_count_df.coalesce(1).write.csv(
        "delayed_orders_count_by_supplier",
        header=True,
        mode="overwrite",
    )
finally:
    # Stop Spark session
    spark.stop()


Delayed Shipments:
+-------+------+----------+------------+----------+----------+
|orderid|itemid| orderdate|deliverydate|delay_days|is_delayed|
+-------+------+----------+------------+----------+----------+
|      1|     1|2025-05-01|  2025-05-10|         9|         1|
|      2|     2|2025-05-03|  2025-05-08|         5|         1|
|      3|     3|2025-05-05|  2025-05-18|        13|         1|
|      4|     4|2025-05-06|  2025-05-12|         6|         1|
|      5|     5|2025-05-08|  2025-06-15|        38|         1|
|      6|     1|2025-05-10|  2025-05-22|        12|         1|
|      7|     6|2025-05-11|  2025-05-20|         9|         1|
|      8|     3|2025-05-12|  2025-05-16|         4|         1|
|      9|     6|2025-05-13|  2025-05-17|         4|         1|
|     11|     2|2025-06-01|  2025-06-10|         9|         1|
+-------+------+----------+------------+----------+----------+

Delayed orders count by supplier:
+----------+-----------------+-----+
|supplierid|     supplierna