**Initialize SparkSession**

In [None]:
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
import findspark

# Run findspark.init() ahead of the pyspark import below, so that pyspark
# is resolved only after findspark has been initialised.
findspark.init()

from pyspark.sql import SparkSession

# getOrCreate() hands back an already-active session when there is one,
# otherwise it starts a new session with this application name.
spark = (
    SparkSession.builder
    .appName("DatabricksEmulationInColab")
    .getOrCreate()
)


**Upload CSV to Colab and Load Data**

In [None]:
# Load the processed report: first row is the header, column types are inferred.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/processed_supply_chain_report.csv")
)
df.show()

+-------+----------+-----------------+------+--------------+----------+------------+----------+----------+-------+
|orderid|supplierid|     suppliername|itemid|      itemname| orderdate|deliverydate|delay_days|is_delayed|delayed|
+-------+----------+-----------------+------+--------------+----------+------------+----------+----------+-------+
|      1|         1|         bubududu|     1|      Widget A|2025-05-01|  2025-05-10|         9|         1|      1|
|      2|         2|      Mickeychoki|     2|      Widget B|2025-05-03|  2025-05-08|         5|         1|      1|
|      3|         3|           tomie.|     3|      Gadget X|2025-05-05|  2025-05-18|        13|         1|      1|
|      4|         1|         bubududu|     4|        Part Z|2025-05-06|  2025-05-12|         6|         1|      1|
|      5|         2|      Mickeychoki|     5|Assembly Kit 9|2025-05-08|  2025-06-15|        38|         0|      1|
|      6|         4|LogiPro Solutions|     1|      Widget A|2025-05-10|  2025-05

**Clean and Filter Data**

In [None]:
from pyspark.sql.functions import col, when, datediff

# Cast both date columns, then derive the delay metrics from them:
#   delay_days = number of days from orderdate to deliverydate
#   is_delayed = 1 when delay_days is positive, otherwise 0
df = (
    df
    .withColumn("orderdate", col("orderdate").cast("date"))
    .withColumn("deliverydate", col("deliverydate").cast("date"))
    .withColumn("delay_days", datediff(col("deliverydate"), col("orderdate")))
    .withColumn("is_delayed", when(col("delay_days") > 0, 1).otherwise(0))
)

# Keep only the shipments with a positive delay
delayed_df = df.where(col("delay_days") > 0)

delayed_df.show(5)


+-------+----------+------------+------+--------------+----------+------------+----------+----------+-------+
|orderid|supplierid|suppliername|itemid|      itemname| orderdate|deliverydate|delay_days|is_delayed|delayed|
+-------+----------+------------+------+--------------+----------+------------+----------+----------+-------+
|      1|         1|    bubududu|     1|      Widget A|2025-05-01|  2025-05-10|         9|         1|      1|
|      2|         2| Mickeychoki|     2|      Widget B|2025-05-03|  2025-05-08|         5|         1|      1|
|      3|         3|      tomie.|     3|      Gadget X|2025-05-05|  2025-05-18|        13|         1|      1|
|      4|         1|    bubududu|     4|        Part Z|2025-05-06|  2025-05-12|         6|         1|      1|
|      5|         2| Mickeychoki|     5|Assembly Kit 9|2025-05-08|  2025-06-15|        38|         1|      1|
+-------+----------+------------+------+--------------+----------+------------+----------+----------+-------+
only showing top 5 rows

**Save Cleaned Data**

In [None]:
# Collapse to a single partition before writing, include a header row, and
# replace any previous export at the same path.
(
    delayed_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("cleaned_delayed_orders")
)

**Run Basic Analysis Queries**

In [None]:
# Register two temp SQL views:
#   delayed_orders - delayed shipments only (delayed_df)
#   all_orders     - every order (df); required by the queries that compare
#                    delayed against non-delayed orders (6, 9 and 10), which
#                    would be trivially empty / 100% on the filtered view
delayed_df.createOrReplaceTempView("delayed_orders")
df.createOrReplaceTempView("all_orders")
# 1. Count total delayed orders
result1 = spark.sql("""
    SELECT COUNT(*) AS total_delayed_orders
    FROM delayed_orders
    WHERE is_delayed = 1
""")
result1.show()
# 2. Count delayed orders by each supplier
result2 = spark.sql("""
    SELECT supplierid, suppliername, COUNT(*) AS delayed_orders_count
    FROM delayed_orders
    WHERE is_delayed = 1
    GROUP BY supplierid, suppliername
    ORDER BY delayed_orders_count DESC
""")
result2.show()
# 3. Average delay days per supplier
result3 = spark.sql("""
    SELECT supplierid, suppliername, AVG(delay_days) AS avg_delay_days
    FROM delayed_orders
    WHERE is_delayed = 1
    GROUP BY supplierid, suppliername
    ORDER BY avg_delay_days DESC
""")
result3.show()
# 4. List all delayed orders with delay greater than 10 days
result4 = spark.sql("""
    SELECT orderid, suppliername, itemname, delay_days
    FROM delayed_orders
    WHERE delay_days > 10
    ORDER BY delay_days DESC
""")
result4.show()
# 5. Count delayed orders per month (based on orderdate)
result5 = spark.sql("""
    SELECT YEAR(orderdate) AS year, MONTH(orderdate) AS month, COUNT(*) AS delayed_count
    FROM delayed_orders
    WHERE is_delayed = 1
    GROUP BY YEAR(orderdate), MONTH(orderdate)
    ORDER BY year, month
""")
result5.show()
# 6. Suppliers with no delayed orders
#    Must read all_orders: the delayed_orders view contains delayed rows only,
#    so no supplier in it could ever qualify. NOT EXISTS is used instead of
#    NOT IN so that a NULL supplierid cannot empty the result.
result6 = spark.sql("""
    SELECT DISTINCT o.supplierid, o.suppliername
    FROM all_orders o
    WHERE NOT EXISTS (
      SELECT 1
      FROM all_orders d
      WHERE d.supplierid = o.supplierid
        AND d.is_delayed = 1
    )
""")
result6.show()
# 7. Top 5 items with the most delays
result7 = spark.sql("""
    SELECT itemid, itemname, COUNT(*) AS delay_count
    FROM delayed_orders
    WHERE is_delayed = 1
    GROUP BY itemid, itemname
    ORDER BY delay_count DESC
    LIMIT 5
""")
result7.show()
# 8. Maximum delay for each supplier
result8 = spark.sql("""
    SELECT supplierid, suppliername, MAX(delay_days) AS max_delay
    FROM delayed_orders
    GROUP BY supplierid, suppliername
    ORDER BY max_delay DESC
""")
result8.show()
# 9. Percentage of delayed orders per supplier
#    The denominator has to be every order of the supplier, hence all_orders.
result9 = spark.sql("""
    SELECT
      supplierid,
      suppliername,
      COUNT(CASE WHEN is_delayed = 1 THEN 1 END) * 100.0 / COUNT(*) AS delay_percentage
    FROM all_orders
    GROUP BY supplierid, suppliername
    ORDER BY delay_percentage DESC
""")
result9.show()
# 10. Count of delayed vs non-delayed orders overall
#     Read from all_orders so the non-delayed bucket can actually appear.
result10 = spark.sql("""
    SELECT is_delayed, COUNT(*) AS order_count
    FROM all_orders
    GROUP BY is_delayed
""")
result10.show()

+--------------------+
|total_delayed_orders|
+--------------------+
|                  10|
+--------------------+

+----------+-----------------+--------------------+
|supplierid|     suppliername|delayed_orders_count|
+----------+-----------------+--------------------+
|         1|         bubududu|                   4|
|         2|      Mickeychoki|                   3|
|         3|           tomie.|                   2|
|         4|LogiPro Solutions|                   1|
+----------+-----------------+--------------------+

+----------+-----------------+------------------+
|supplierid|     suppliername|    avg_delay_days|
+----------+-----------------+------------------+
|         2|      Mickeychoki|15.666666666666666|
|         4|LogiPro Solutions|              12.0|
|         3|           tomie.|              11.0|
|         1|         bubududu|               7.0|
+----------+-----------------+------------------+

+-------+-----------------+--------------+----------+
|orderid|   