In [1]:

!pip install -q pyspark==3.5.1 delta-spark==3.1.0

from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Session builder with the Delta Lake SQL extension and the Delta catalog enabled.
builder = (
    SparkSession.builder
    .appName("ECommerceDeltaLake")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the pip-installed delta package finish configuring the builder, then
# create the session (or reuse one that is already running).
delta_ready_builder = configure_spark_with_delta_pip(builder)
spark = delta_ready_builder.getOrCreate()


In [4]:

# Read the three source CSVs: first row is the header, column types are inferred.
csv_options = {"header": True, "inferSchema": True}
orders_df = spark.read.csv("orders.csv", **csv_options)
customers_df = spark.read.csv("customers.csv", **csv_options)
products_df = spark.read.csv("products.csv", **csv_options)

# Persist each DataFrame as a Delta table, replacing whatever is already at the path.
for source_df, delta_path in (
    (orders_df, "/content/delta/orders"),
    (customers_df, "/content/delta/customers"),
    (products_df, "/content/delta/products"),
):
    source_df.write.format("delta").mode("overwrite").save(delta_path)


In [5]:
# 1. Ingest all 3 CSVs as Delta Tables.
# The Delta tables themselves are written by the previous cell. This cell reads
# the orders table back from its Delta path, exposes it as the "orders" temp
# view, and runs the revenue-per-product query (delivered orders only) on it.
delivered_revenue_sql = """
SELECT ProductID, SUM(Quantity * Price) AS TotalRevenue
FROM orders
WHERE Status = 'Delivered'
GROUP BY ProductID
"""

ingested_orders = spark.read.format("delta").load("/content/delta/orders")
ingested_orders.createOrReplaceTempView("orders")

spark.sql(delivered_revenue_sql).show()


+---------+------------+
|ProductID|TotalRevenue|
+---------+------------+
|    P1001|       75000|
|    P1002|       50000|
|    P1003|       30000|
+---------+------------+



In [6]:
# 2. Write SQL to get the total revenue per Product.
# Revenue is Quantity * Price, summed per ProductID over delivered orders only.
orders_from_delta = spark.read.format("delta").load("/content/delta/orders")
orders_from_delta.createOrReplaceTempView("orders")

revenue_per_product = spark.sql("""
SELECT ProductID, SUM(Quantity * Price) AS TotalRevenue
FROM orders
WHERE Status = 'Delivered'
GROUP BY ProductID
""")
revenue_per_product.show()


+---------+------------+
|ProductID|TotalRevenue|
+---------+------------+
|    P1001|       75000|
|    P1002|       50000|
|    P1003|       30000|
+---------+------------+



In [7]:
# 3. Join Orders + Customers to find revenue by Region
# Load both Delta tables and register each as a temp view so they can be joined in SQL.
orders_df = spark.read.format("delta").load("/content/delta/orders")
orders_df.createOrReplaceTempView("orders")

customers_df = spark.read.format("delta").load("/content/delta/customers")
customers_df.createOrReplaceTempView("customers")

# Inner join on CustomerID; only delivered orders contribute to a region's revenue.
regional_revenue = spark.sql("""
SELECT c.Region, SUM(o.Quantity * o.Price) AS RegionalRevenue
FROM orders o
JOIN customers c ON o.CustomerID = c.CustomerID
WHERE o.Status = 'Delivered'
GROUP BY c.Region
""")
regional_revenue.show()


+------+---------------+
|Region|RegionalRevenue|
+------+---------------+
|  West|          30000|
| North|         125000|
+------+---------------+



In [8]:
# 4. Update the Status of Pending orders to 'Cancelled'
from delta.tables import DeltaTable

# Handle on the orders Delta table; the update below is applied to the table in place.
orders_delta = DeltaTable.forPath(spark, "/content/delta/orders")

# Both the condition and the assigned value are SQL expression strings, which
# is why the new status carries its own single quotes (a SQL string literal).
pending_filter = "Status = 'Pending'"
cancelled_assignment = {"Status": "'Cancelled'"}
orders_delta.update(condition=pending_filter, set=cancelled_assignment)

orders_delta.toDF().show()


+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|
|   3005|      C004|    P1004|       3|10000|2024-05-05|Cancelled|
+-------+----------+---------+--------+-----+----------+---------+



In [9]:
# 5. Merge a new return record into Orders
from pyspark.sql.functions import lit

# Single-row DataFrame holding the new "Returned" order.
order_columns = ["OrderID", "CustomerID", "ProductID", "Quantity", "Price", "OrderDate", "Status"]
new_order_rows = [(3006, "C002", "P1002", 1, 50000, "2024-05-06", "Returned")]
new_order = spark.createDataFrame(new_order_rows, order_columns)

# Insert-only merge keyed on OrderID: the row is added only when no existing
# order has the same OrderID; matching rows are left untouched.
# `orders_delta` is the DeltaTable handle created in the previous step.
merge_builder = orders_delta.alias("target").merge(
    new_order.alias("source"),
    "target.OrderID = source.OrderID",
)
merge_builder.whenNotMatchedInsertAll().execute()

orders_delta.toDF().show()


+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|
|   3005|      C004|    P1004|       3|10000|2024-05-05|Cancelled|
|   3006|      C002|    P1002|       1|50000|2024-05-06| Returned|
+-------+----------+---------+--------+-----+----------+---------+



In [10]:
# 6. Create raw → cleaned → aggregated tables
from pyspark.sql import functions as F

# Raw: the orders Delta table as it currently stands.
raw_orders = spark.read.format("delta").load("/content/delta/orders")

# Cleaned: drop every row that has a null in any column.
cleaned_orders = raw_orders.dropna()
cleaned_orders.write.format("delta").mode("overwrite").save("/content/delta/cleaned_orders")

# Aggregated: revenue per product over delivered orders, computed from the
# cleaned rows. This mirrors the revenue-per-product SQL used in step 2.
aggregated_orders = (
    cleaned_orders
    .filter("Status = 'Delivered'")
    .groupBy("ProductID")
    .agg(F.sum(F.col("Quantity") * F.col("Price")).alias("TotalRevenue"))
)
aggregated_orders.write.format("delta").mode("overwrite").save("/content/delta/aggregated_orders")


In [15]:
# 7. View previous version
# Time travel: read the orders table as of version 0 instead of its latest version.
version_zero_reader = spark.read.format("delta").option("versionAsOf", 0)
old_orders = version_zero_reader.load("/content/delta/orders")
old_orders.show()


+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|
|   3005|      C004|    P1004|       3|10000|2024-05-05|  Pending|
+-------+----------+---------+--------+-----+----------+---------+



In [16]:
# 8. Restore to an older version
from delta.tables import DeltaTable

# Snapshot of version 0, kept under its original name for any cell that uses it.
original_orders = spark.read.format("delta").option("versionAsOf", 0).load("/content/delta/orders")

# Use Delta's native restore instead of reading version 0 and overwriting the
# table with it: the restore is recorded as a RESTORE operation in the table
# history and does not require rewriting the table's data from a DataFrame.
# The returned metrics DataFrame is bound to a name so the cell stays silent.
restore_metrics = DeltaTable.forPath(spark, "/content/delta/orders").restoreToVersion(0)


In [19]:
# 9. Vacuum the orders table with a retention period of 0 hours.
from delta.tables import DeltaTable

# Turn off Delta's retention-duration check before vacuuming with 0 hours.
# NOTE(review): a 0-hour vacuum removes data files that the current table
# version no longer references, so time travel to older versions is expected
# to stop working afterwards — confirm this is intended.
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", False)

orders_table = DeltaTable.forPath(spark, "/content/delta/orders")
zero_retention_hours = 0
orders_table.vacuum(zero_retention_hours)

# The current version of the table must still be fully readable.
orders_table.toDF().show()

+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|
|   3005|      C004|    P1004|       3|10000|2024-05-05|  Pending|
+-------+----------+---------+--------+-----+----------+---------+



In [20]:
# 10. Expectations: Quantity > 0, Price > 0, OrderDate not null
orders = spark.read.format("delta").load("/content/delta/orders")

# Keep only the rows that satisfy all three expectations at once.
expectations = "Quantity > 0 AND Price > 0 AND OrderDate IS NOT NULL"
valid_orders = orders.where(expectations)
valid_orders.show()


+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|
|   3005|      C004|    P1004|       3|10000|2024-05-05|  Pending|
+-------+----------+---------+--------+-----+----------+---------+



In [21]:
# 11. Bonus: Add OrderType column using when-otherwise
from pyspark.sql.functions import when

orders = spark.read.format("delta").load("/content/delta/orders")

# "Return" for orders whose Status is "Returned", "Regular" for everything else.
is_returned = orders["Status"] == "Returned"
order_type = when(is_returned, "Return").otherwise("Regular")

orders_with_type = orders.withColumn("OrderType", order_type)
orders_with_type.show()


+-------+----------+---------+--------+-----+----------+---------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|OrderType|
+-------+----------+---------+--------+-----+----------+---------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|  Regular|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|   Return|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|  Regular|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|  Regular|
|   3005|      C004|    P1004|       3|10000|2024-05-05|  Pending|  Regular|
+-------+----------+---------+--------+-----+----------+---------+---------+

