In [0]:
# Scenario 1: Inventory Alerting System
# Tasks:
# 1. Load the data using PySpark.
# 2. Create a new column NeedsReorder = StockQty < ReorderLevel .
# 3. Create a view of all items that need restocking.
# 4. Highlight warehouses with more than 2 such items.
from pyspark.sql.functions import *
df = spark.read.csv("file:/Workspace/Shared/inventory_supply19.csv",header=True,InferSchema=True)

df = df.withColumn("NeedsReorder", col("StockQty") < col("ReorderLevel"))

df.filter("NeedsReorder").createOrReplaceTempView("items_to_restock")

spark.sql(
    """
    SELECT Warehouse, COUNT(*) AS NeedsCount
    FROM items_to_restock
    GROUP BY Warehouse
    HAVING COUNT(*) > 2
    """
).show()


+---------+----------+
|Warehouse|NeedsCount|
+---------+----------+
+---------+----------+



In [0]:
# Scenario 2: Supplier Price Optimization
# Tasks:
# 1. Group items by Supplier and compute average price.
# 2. Find which suppliers offer items below average price in their category.
# 3. Tag suppliers with Good Deal if >50% of their items are below market average.
spark.sql("""
SELECT Supplier, AVG(UnitPrice) AS AvgPrice
FROM items_to_restock
GROUP BY Supplier
""").show()

spark.sql("""
SELECT d.Supplier, d.ItemID, d.UnitPrice
FROM items_to_restock d
JOIN sup_avg_price s ON d.Supplier = s.Supplier
WHERE d.UnitPrice < s.AvgPrice
""").show()

spark.sql("""
SELECT Supplier,
  AVG(CASE WHEN UnitPrice < (SELECT AVG(UnitPrice) FROM items_to_restock WHERE Category = d.Category) THEN 1 ELSE 0 END) > 0.5 AS GoodDeal
FROM items_to_restock d
GROUP BY Supplier
""").show()


+---------+--------+
| Supplier|AvgPrice|
+---------+--------+
|TechWorld| 70000.0|
|PrintFast|  8000.0|
| FreezeIt| 25000.0|
+---------+--------+

+--------+------+---------+
|Supplier|ItemID|UnitPrice|
+--------+------+---------+
+--------+------+---------+

+---------+--------+
| Supplier|GoodDeal|
+---------+--------+
|TechWorld|   false|
|PrintFast|    true|
| FreezeIt|   false|
+---------+--------+



In [0]:
# Scenario 3: Cost Forecasting
# Tasks:
# 1. Calculate TotalStockValue = StockQty * UnitPrice .
# 2. Identify top 3 highest-value items.
# 3. Export the result as a Parquet file partitioned by Warehouse
df = df.withColumn("TotalStockValue", col("StockQty") * col("UnitPrice"))
df.orderBy(col("TotalStockValue").desc()).limit(3).show()

df.write.mode("overwrite").partitionBy("Warehouse").parquet("file:/Workspace/Shared/top_stock_value")


+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|TotalStockValue|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|  30000.0|   AVTech|       false|      1500000.0|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|  70000.0|TechWorld|        true|       700000.0|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|   6000.0|  ChairCo|       false|       240000.0|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+



In [0]:
# Scenario 4: Warehouse Utilization
# Tasks:
# 1. Count items stored per warehouse.
# 2. Average stock per category in each warehouse.
# 3. Determine underutilized warehouses ( total stock < 100 ).

df.groupBy("Warehouse").count().show()

df.groupBy("Warehouse", "Category").avg("StockQty").show()

df.groupBy("Warehouse").sum("StockQty").filter("sum(StockQty) < 100").show()


+----------+-----+
| Warehouse|count|
+----------+-----+
|WarehouseA|    2|
|WarehouseC|    1|
|WarehouseB|    2|
+----------+-----+

+----------+-----------+-------------+
| Warehouse|   Category|avg(StockQty)|
+----------+-----------+-------------+
|WarehouseB|Electronics|          6.5|
|WarehouseA|  Furniture|         40.0|
|WarehouseC| Appliances|          5.0|
|WarehouseA|Electronics|         50.0|
+----------+-----------+-------------+

+----------+-------------+
| Warehouse|sum(StockQty)|
+----------+-------------+
|WarehouseA|           90|
|WarehouseC|            5|
|WarehouseB|           13|
+----------+-------------+



In [0]:
# Scenario 5: Delta Audit Trail
# Tasks:
# 1. Save as Delta table retail_inventory .
# 2. Update stock of 'Laptop' to 20.
# 3. Delete any item with StockQty = 0 .
# 4. Run DESCRIBE HISTORY and query VERSION AS OF previous state.
from delta.tables import DeltaTable

df.write.format("delta").mode("overwrite").save("file:/Worksapce/Shared/retail_inventory")
DeltaTable.forPath(spark, "/delta/retail_inventory")

DeltaTable.forPath(spark, "/delta/retail_inventory").update(
    condition=col("ItemName") == "Laptop", set={"StockQty": "20"}
)

DeltaTable.forPath(spark, "/delta/retail_inventory").delete("StockQty = 0")

spark.sql("DESCRIBE HISTORY delta.`/delta/retail_inventory`").show()
spark.read.format("delta").option("versionAsOf", 0).load("/delta/retail_inventory").show()


+-------+-------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|          userId|            userName|operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      3|2025-06-19 05:15:55|1679761755594499|azuser3546_mml.lo...| OPTIMIZE|{predicate -> [],...|NULL|{2977741827703191}|0612-091342-i15khidz|          1|SnapshotIsolation|        false|{numRemovedFiles ...|        NULL|Databricks-Runtim...|
|      2|2025-06-19 05:15:53

In [0]:
# Scenario 6: Alerts from Restock Logs (Join Task)
# restock_logs.csv :
# ItemID,RestockDate,QuantityAdded
# I002,2024-04-20,10
# I005,2024-04-22,5
# I001,2024-04-25,20
# Tasks:
# 1. Join with inventory table to update StockQty.
# 2. Calculate new stock and flag RestockedRecently = true for updated items.
# 3. Use MERGE INTO to update in Delta.

logs = spark.read.csv("file:/Workspace/Shared/restock_logs19.csv",header=True,InferSchema=True)

joined = logs.join(df, "ItemID")
updates = joined.withColumn("NewStock", col("StockQty") + col("QuantityAdded")).withColumn("RestockedRecently", lit(True))

DeltaTable.forPath(spark, "/delta/retail_inventory").alias("inv").merge(updates.alias("update"),"inv.ItemID = update.ItemID").whenMatchedUpdate(set={
  "StockQty": "update.NewStock",
  "LastRestocked": "update.RestockDate"
}).execute()


In [0]:
# Scenario 7: Report Generation with SQL Views
# Tasks:
# 1. Create SQL view inventory_summary with:
# ItemName, Category, StockQty, NeedsReorder, TotalStockValue
# 2. Create view supplier_leaderboard sorted by average price


df.createOrReplaceTempView("inventory")

spark.sql("""
    CREATE OR REPLACE TEMP VIEW inventory_summary AS
    SELECT
        ItemName,
        Category,
        StockQty,
        CASE
            WHEN StockQty < ReorderLevel THEN true
            ELSE false
        END AS NeedsReorder,
        StockQty * UnitPrice AS TotalStockValue
    FROM inventory
""")

spark.sql("""
    CREATE OR REPLACE TEMP VIEW supplier_leaderboard AS
    SELECT
        Supplier,
        ROUND(AVG(UnitPrice), 2) AS AvgPrice
    FROM inventory
    GROUP BY Supplier
    ORDER BY AvgPrice
""")

spark.sql("SELECT * FROM inventory_summary").show()
spark.sql("SELECT * FROM supplier_leaderboard").show()


+------------+-----------+--------+------------+---------------+
|    ItemName|   Category|StockQty|NeedsReorder|TotalStockValue|
+------------+-----------+--------+------------+---------------+
|      LED TV|Electronics|      50|       false|        1500000|
|      Laptop|Electronics|      10|        true|         700000|
|Office Chair|  Furniture|      40|       false|         240000|
|Refrigerator| Appliances|       5|        true|         125000|
|     Printer|Electronics|       3|        true|          24000|
+------------+-----------+--------+------------+---------------+

+---------+--------+
| Supplier|AvgPrice|
+---------+--------+
|  ChairCo|  6000.0|
|PrintFast|  8000.0|
| FreezeIt| 25000.0|
|   AVTech| 30000.0|
|TechWorld| 70000.0|
+---------+--------+



In [0]:
# Scenario 8: Advanced Filtering
# Tasks:
# 1. Use when / otherwise to categorize items:
# "Overstocked" (>2x ReorderLevel)
# "LowStock"
# 2. Use .filter() and .where() for the same and compare.
df2 = df.withColumn("StockCategory",when(col("StockQty") > 2*col("ReorderLevel"), "Overstocked").when(col("StockQty") < col("ReorderLevel"), "LowStock").otherwise("Normal"))

df2.filter("StockCategory = 'LowStock'").show()
df2.where(col("StockCategory") == "LowStock").show()


+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|TotalStockValue|StockCategory|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-------------+
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|  70000.0|TechWorld|        true|       700000.0|     LowStock|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2024-02-20|  25000.0| FreezeIt|        true|       125000.0|     LowStock|
|  I005|     Printer|Electronics|WarehouseB|       3|           5|   2024-03-30|   8000.0|PrintFast|        true|        24000.0|     LowStock|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-------

In [0]:
# Scenario 9: Feature Engineering
# Tasks:
# 1. Extract RestockMonth from LastRestocked .
# 2. Create feature: StockAge = CURRENT_DATE - LastRestocked
# 3. Bucket StockAge into: New, Moderate, Stale

df = df.withColumn("RestockMonth", month("LastRestocked"))
df = df.withColumn("StockAge", datediff(current_date(), col("LastRestocked")))
df = df.withColumn("StockAgeType",when(col("StockAge") < 30, "New").when(col("StockAge") < 90, "Moderate").otherwise("Stale"))


In [0]:
# Scenario 10: Export Options
# Tasks:
# 1. Write full DataFrame to:
# CSV for analysts
# JSON for integration
# Delta for pipelines
# 2. Save with meaningful file and partition names like
# /export/inventory/stale_items/
df.write.mode("overwrite").csv("file:/Workspace/Shared/inventory_full", header=True)
df.write.mode("overwrite").json("file:/Workspace/Shared/inventory_full_json")
df.write.mode("overwrite").format("delta").save("file:/Workspace/Shared/inventory_delta")

df.write.mode("overwrite").format("delta").partitionBy("Category").save("file:/Workspace/Shared/inventory_delta2")
