In [1]:
# Start (or reuse) the notebook's Spark session under the app name "Supply"
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("Supply")
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary
spark

In [2]:
# Load the orders file: first line is the header, column types are inferred
order = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('orders.csv')
)
# Preview the loaded rows
order.show()

+--------+----------+---------+--------------------+----------+-------------+--------+
|order_id|  customer| supplier|                item|order_date|delivery_date|quantity|
+--------+----------+---------+--------------------+----------+-------------+--------+
|       1|   Sheldon|SupplierA|          Whiteboard|05-01-2025|   10-01-2025|       2|
|       2|   Leonard|SupplierB|       Laser Tag Set|07-01-2025|   28-01-2025|       1|
|       3|     Penny|SupplierC|Cheesecake Factor...|10-01-2025|   15-01-2025|       5|
|       4|    Howard|SupplierA|        Space Helmet|12-01-2025|   20-02-2025|       1|
|       5|       Raj|SupplierB| Star Wars Figurines|15-01-2025|   22-01-2025|       3|
|       6|       Amy|SupplierC|      Brain Scan Kit|18-01-2025|   25-01-2025|       1|
|       7|Bernadette|SupplierA| Miniature Telescope|20-01-2025|   15-02-2025|       2|
|       8|    Stuart|SupplierB|         Comic Books|22-01-2025|   01-03-2025|      10|
|       9|   Sheldon|SupplierC|           T

In [3]:
# Load the per-order delivery status file (header row, inferred column types)
delivery = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('delivery_status.csv')
)
# NOTE(review): in the preview, order 10 shows a NULL delay_days and an issue
# value that starts with stray quote characters — possibly a quoting problem
# in the CSV; confirm against the raw file before relying on that row.
delivery.show()

+--------+---------------+----------+--------------------+
|order_id|delivery_status|delay_days|               issue|
+--------+---------------+----------+--------------------+
|       1|        On Time|         5|                None|
|       2|        Delayed|        21|    Supplier backlog|
|       3|        On Time|         5|                None|
|       4|        Delayed|        39|Space helmet impo...|
|       5|        On Time|         7|                None|
|       6|        On Time|         7|                None|
|       7|        Delayed|        26|Supplier capacity...|
|       8|        Delayed|        38|Comic book restoc...|
|       9|        On Time|         5|                None|
|      10|        Delayed|      NULL|" ""Missing deliv...|
+--------+---------------+----------+--------------------+



In [5]:
# Load the supplier reference table (header row, inferred column types)
supplier = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('suppliers.csv')
)
# Preview the loaded rows
supplier.show()


+-----------+-------------+--------------------+
|supplier_id|supplier_name|            location|
+-----------+-------------+--------------------+
|          1|    SupplierA|Pasadena Electronics|
|          2|    SupplierB|      Nerdvana Games|
|          3|    SupplierC|Cheesecake Factor...|
+-----------+-------------+--------------------+



In [6]:
# Attach delivery information to each order; the inner join keeps only
# orders whose order_id appears in both frames
or_del = order.join(delivery, 'order_id', 'inner')
# Preview the combined order + delivery rows
or_del.show()

+--------+----------+---------+--------------------+----------+-------------+--------+---------------+----------+--------------------+
|order_id|  customer| supplier|                item|order_date|delivery_date|quantity|delivery_status|delay_days|               issue|
+--------+----------+---------+--------------------+----------+-------------+--------+---------------+----------+--------------------+
|       1|   Sheldon|SupplierA|          Whiteboard|05-01-2025|   10-01-2025|       2|        On Time|         5|                None|
|       2|   Leonard|SupplierB|       Laser Tag Set|07-01-2025|   28-01-2025|       1|        Delayed|        21|    Supplier backlog|
|       3|     Penny|SupplierC|Cheesecake Factor...|10-01-2025|   15-01-2025|       5|        On Time|         5|                None|
|       4|    Howard|SupplierA|        Space Helmet|12-01-2025|   20-02-2025|       1|        Delayed|        39|Space helmet impo...|
|       5|       Raj|SupplierB| Star Wars Figurines|15-

In [8]:
# Keep only the joined rows whose delivery_status is exactly 'Delayed'
from pyspark.sql.functions import col, count
delay = or_del.where(
    col('delivery_status') == 'Delayed'
)
# Preview the delayed orders
delay.show()

+--------+----------+---------+-------------------+----------+-------------+--------+---------------+----------+--------------------+
|order_id|  customer| supplier|               item|order_date|delivery_date|quantity|delivery_status|delay_days|               issue|
+--------+----------+---------+-------------------+----------+-------------+--------+---------------+----------+--------------------+
|       2|   Leonard|SupplierB|      Laser Tag Set|07-01-2025|   28-01-2025|       1|        Delayed|        21|    Supplier backlog|
|       4|    Howard|SupplierA|       Space Helmet|12-01-2025|   20-02-2025|       1|        Delayed|        39|Space helmet impo...|
|       7|Bernadette|SupplierA|Miniature Telescope|20-01-2025|   15-02-2025|       2|        Delayed|        26|Supplier capacity...|
|       8|    Stuart|SupplierB|        Comic Books|22-01-2025|   01-03-2025|      10|        Delayed|        38|Comic book restoc...|
|      10|   Leonard|SupplierA|        Wifi Router|28-01-2025|

In [9]:
# Number of delayed orders per supplier, reported as 'delay_orders'
grouped = (
    delay
    .groupBy('supplier')
    .agg(count('*').alias('delay_orders'))
)
# Preview the per-supplier counts
grouped.show()

+---------+------------+
| supplier|delay_orders|
+---------+------------+
|SupplierA|           3|
|SupplierB|           2|
+---------+------------+



In [10]:
# Persist the per-supplier delay counts as CSV with a header row.
# Spark writes a directory named 'grouped_data.csv' containing part files.
# mode='overwrite' keeps this cell re-runnable: with the default save mode the
# write raises an error as soon as the target path exists from an earlier run.
grouped.write.csv('grouped_data.csv', mode='overwrite', header=True)