In [0]:
from pyspark.sql.functions import *
from pyspark.sql.functions import sum as _sum
#  PySpark + Delta
#  1. Ingest all 3 CSVs as Delta Tables.

# Load each CSV, treating its first line as the column header.
# NOTE(review): no schema or inferSchema option is supplied, so the CSV
# reader's defaults decide the column types — confirm that is intended.
ordersdf = spark.read.csv("file:/Workspace/Shared/orders.csv", header=True)
customersdf = spark.read.csv("file:/Workspace/Shared/customers.csv", header=True)
productsdf = spark.read.csv("file:/Workspace/Shared/products.csv", header=True)

# Persist every DataFrame as a Delta table, replacing whatever is already
# stored at the target path.
ordersdf.write.format("delta").mode("overwrite").save("file:/Workspace/Shared/orders")
customersdf.write.format("delta").mode("overwrite").save("file:/Workspace/Shared/customers")
productsdf.write.format("delta").mode("overwrite").save("file:/Workspace/Shared/products")

#  2. Write SQL to get the total revenue per Product.
# Load the orders Delta table; `orddelta` is reused by later steps.
orddelta = spark.read.format("delta").load("file:/Workspace/Shared/orders")

# Only delivered orders count towards revenue. Revenue per row is
# Quantity * Price, summed per ProductID into TotalRevenue.
revenue_per_product = (
    orddelta
    .filter(col("Status") == "Delivered")
    .withColumn("Revenue", col("Quantity") * col("Price"))
    .groupBy("ProductID")
    .agg(_sum("Revenue").alias("TotalRevenue"))
)

revenue_per_product.show()

#  3. Join Orders + Customers to find revenue by Region.

customers_delta = spark.read.format("delta").load("file:/Workspace/Shared/customers")

# The orders DataFrame is bound to `orddelta` when the orders table is loaded
# for step 2; `orders_delta` was never assigned, so joining on it raised
# NameError.
orders_with_customers = orddelta.join(customers_delta, "CustomerID")

# Delivered orders only; Revenue = Quantity * Price, summed per customer Region.
revenue_by_region = (
    orders_with_customers
    .filter(col("Status") == "Delivered")
    .withColumn("Revenue", col("Quantity") * col("Price"))
    .groupBy("Region")
    .agg(_sum("Revenue").alias("TotalRevenue"))
)

revenue_by_region.show()

#  4. Update the Status of Pending orders to 'Cancelled'.

from delta.tables import DeltaTable

# Handle on the orders Delta table used for the in-place update, and later
# for merge, restore and vacuum.
delta_orders = DeltaTable.forPath(spark, "file:/Workspace/Shared/orders")

# Values in `set` may be Columns or strings, and Delta parses a string as a SQL
# expression. A bare "Cancelled" would therefore be resolved as a column
# named Cancelled, so the literal is wrapped in lit().
delta_orders.update(
    condition=col("Status") == "Pending",
    set={"Status": lit("Cancelled")},
)

#  5. Merge a new return record into Orders.
from pyspark.sql import Row

# One-row source DataFrame describing the returned order.
new_return = [
    Row(
        OrderID="3006",
        CustomerID="C002",
        ProductID="P1002",
        Quantity=1,
        Price=50000,
        OrderDate="2024-05-06",
        Status="Returned",
    )
]
new_return_df = spark.createDataFrame(new_return)

# Insert-only merge keyed on OrderID: the row is inserted when no existing
# order carries the same OrderID. No matched clause is defined, so rows that
# do match are left as they are.
(
    delta_orders.alias("orders")
    .merge(new_return_df.alias("new"), "orders.OrderID = new.OrderID")
    .whenNotMatchedInsertAll()
    .execute()
)

#  DLT Pipeline
#  6. Create raw → cleaned → aggregated tables:
#  Clean: Remove rows with NULLs
#  Aggregated: Total revenue per Category
# NOTE(review): this section uses plain DataFrame operations rather than a
# DLT pipeline definition — confirm whether that satisfies the task.

# Cleaned layer: drop every order row that has a NULL in any column.
# `orddelta` is the orders DataFrame loaded for step 2; `orders_delta` was
# never assigned and raised NameError here.
cleaned_orders = orddelta.dropna()

products_delta = spark.read.format("delta").load("file:/Workspace/Shared/products")

# Revenue is computed from the order columns before the join, then each
# delivered order is matched to its product to pick up the Category.
joined = (
    cleaned_orders
    .filter(col("Status") == "Delivered")
    .withColumn("Revenue", col("Quantity") * col("Price"))
    .join(products_delta, "ProductID")
)

# Aggregated layer: total delivered revenue per product Category.
revenue_per_category = joined.groupBy("Category").agg(_sum("Revenue").alias("TotalRevenue"))
revenue_per_category.show()

#  Time Travel
#  7. View data before the Status update.
# Read version 0 of the orders table through the versionAsOf reader option.
(
    spark.read.format("delta")
    .option("versionAsOf", 0)
    .load("file:/Workspace/Shared/orders")
    .show()
)

#  8. Restore to an older version of the orders table.
# Roll the orders table back to version 0.
delta_orders.restoreToVersion(0)

#  Vacuum + Retention

#  9. Run VACUUM after changing default retention.

# Turn off Delta's retention-duration check for this session so that the
# 0-hour retention passed to vacuum below is accepted.
spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = false")

# NOTE(review): a 0-hour retention presumably deletes every data file the
# current table version no longer references, which would make older versions
# unreadable via time travel — confirm this is acceptable outside an exercise.
delta_orders.vacuum(retentionHours=0)

#  Expectations

#  10. Quantity > 0, Price > 0, OrderDate is not null

# Keep only the order rows that satisfy all three expectations.
# `orddelta` is the orders DataFrame loaded for step 2; `orders_delta` was
# never assigned and raised NameError here.
quality_checked = orddelta.filter(
    (col("Quantity") > 0)
    & (col("Price") > 0)
    & (col("OrderDate").isNotNull())
)

#  Bonus

#  11. Use when-otherwise to create a new column:
#  OrderType = 'Return' when Status is 'Returned', otherwise 'Regular'.
# (The "OrderDate is not null" expectation from step 10 is already part of
# the filter that builds quality_checked.)
final_orders = quality_checked.withColumn(
    "OrderType",
    when(col("Status") == "Returned", "Return").otherwise("Regular"),
)

# Show the derived column next to the order key and the status it came from.
final_orders.select("OrderID", "Status", "OrderType").show()
