In [1]:
!pip install delta-spark==3.2.0 -q
import pyspark
from delta import *
from pyspark.sql.functions import *

# Create a SparkSession with Delta Lake extensions
# The '.config(...)' lines are crucial for enabling Delta Lake's features
builder = pyspark.sql.SparkSession.builder.appName("DeltaTutorial") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Get or create the SparkSession
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [2]:
# Loading data
customer = spark.read.csv('Customers.csv', header=True, inferSchema=True)
customer.show()

+-----------+-----------------+--------------------+------+
|customer_id|             name|               email|region|
+-----------+-----------------+--------------------+------+
|          1|    Claire Dunphy|  claire@example.com|  East|
|          2|      Phil Dunphy|realtorphil@examp...|  East|
|          3|    Jay Prichette|     jay@example.com| North|
|          4| Gloria Prichette|  gloria@example.com| North|
|          5|Mitchel Prichette|lawyermitch@examp...| South|
|          6|   Cameron Tucker|     cam@example.com| South|
|          7|     Haley Dunphy|   haley@example.com|  West|
|          8|   Dylan Marshall|   dylan@example.com|  West|
+-----------+-----------------+--------------------+------+



In [3]:
order = spark.read.csv('orders.csv', header=True, inferSchema=True)
order.show()

+--------+-----------+----------+------------+
|order_id|customer_id|order_date|total_amount|
+--------+-----------+----------+------------+
|       1|          1|01-08-2025|        2500|
|       2|          2|05-08-2025|        1800|
|       3|          3|02-08-2025|        3200|
|       4|          4|10-08-2025|        1450|
|       5|          5|15-08-2025|        2200|
|       6|          6|07-08-2025|        2750|
|       7|          7|18-08-2025|         900|
|       8|          8|12-08-2025|        1100|
+--------+-----------+----------+------------+



In [4]:
delivery = spark.read.csv('delivery_status.csv', header=True, inferSchema=True)
delivery.show()

+-----------+--------+---------------+----------+
|delivery_id|order_id|delivery_status|updated_on|
+-----------+--------+---------------+----------+
|          1|       1|      DELIVERED|05-08-2025|
|          2|       2|     IN_TRANSIT|06-08-2025|
|          3|       3|      DELIVERED|03-08-2025|
|          4|       4|     IN_TRANSIT|11-08-2025|
|          5|       5|         PLACED|15-08-2025|
|          6|       6|      DELIVERED|08-08-2025|
|          7|       7|         PLACED|18-08-2025|
|          8|       8|     IN_TRANSIT|13-08-2025|
+-----------+--------+---------------+----------+



In [11]:
from pyspark.sql.window import Window
window_spec = Window.partitionBy("order_id").orderBy(desc("updated_on"))

latest_delivery = (delivery
    .withColumn("rn", row_number().over(window_spec))
    .filter(col("rn") == 1)
    .drop("rn"))
latest_delivery.show()

+-----------+--------+---------------+----------+
|delivery_id|order_id|delivery_status|updated_on|
+-----------+--------+---------------+----------+
|          1|       1|      DELIVERED|05-08-2025|
|          2|       2|     IN_TRANSIT|06-08-2025|
|          3|       3|      DELIVERED|03-08-2025|
|          4|       4|     IN_TRANSIT|11-08-2025|
|          5|       5|         PLACED|15-08-2025|
|          6|       6|      DELIVERED|08-08-2025|
|          7|       7|         PLACED|18-08-2025|
|          8|       8|     IN_TRANSIT|13-08-2025|
+-----------+--------+---------------+----------+



In [7]:
order_latest = order.join(latest_delivery, "order_id", "left")
order_latest.show()

+--------+-----------+----------+------------+-----------+---------------+----------+
|order_id|customer_id|order_date|total_amount|delivery_id|delivery_status|updated_on|
+--------+-----------+----------+------------+-----------+---------------+----------+
|       1|          1|01-08-2025|        2500|          1|      DELIVERED|05-08-2025|
|       2|          2|05-08-2025|        1800|          2|     IN_TRANSIT|06-08-2025|
|       3|          3|02-08-2025|        3200|          3|      DELIVERED|03-08-2025|
|       4|          4|10-08-2025|        1450|          4|     IN_TRANSIT|11-08-2025|
|       5|          5|15-08-2025|        2200|          5|         PLACED|15-08-2025|
|       6|          6|07-08-2025|        2750|          6|      DELIVERED|08-08-2025|
|       7|          7|18-08-2025|         900|          7|         PLACED|18-08-2025|
|       8|          8|12-08-2025|        1100|          8|     IN_TRANSIT|13-08-2025|
+--------+-----------+----------+------------+--------

In [9]:
#Saving to delta
order_latest.write.format("delta").mode("overwrite").save("/tmp/orders_with_latest_status_delta")

In [18]:
order_latest.write.mode('overwrite').csv('order_latest_csv', header=True)

In [10]:
type(order_latest)

In [12]:
order_latest.createOrReplaceTempView('order_latest')

In [16]:
customer.createOrReplaceTempView('customer')

In [17]:
top_delayed_customers = spark.sql("""
    SELECT c.name AS customer_name,
           COUNT(*) AS delayed_orders
    FROM order_latest o
    JOIN customer c ON o.customer_id = c.customer_id
    WHERE o.delivery_status != 'DELIVERED'
    GROUP BY c.name
    ORDER BY delayed_orders DESC
    LIMIT 5
""")

top_delayed_customers.show()


+-----------------+--------------+
|    customer_name|delayed_orders|
+-----------------+--------------+
|   Dylan Marshall|             1|
|Mitchel Prichette|             1|
|      Phil Dunphy|             1|
| Gloria Prichette|             1|
|     Haley Dunphy|             1|
+-----------------+--------------+

