In [0]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# Bare expression so the notebook renders the session summary.
spark

Scenario 1: Inventory Alerting System

In [0]:
# Load the data using PySpark.
# Column names come from the CSV header row; column types are inferred from the data.
inventory_reader = spark.read.options(header=True, inferSchema=True)
df = inventory_reader.csv("file:/Workspace/Shared/inventory_supply.csv")

# Show the inferred schema first, then a sample of the rows.
df.printSchema()
df.show()

root
 |-- ItemID: string (nullable = true)
 |-- ItemName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Warehouse: string (nullable = true)
 |-- StockQty: integer (nullable = true)
 |-- ReorderLevel: integer (nullable = true)
 |-- LastRestocked: date (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Supplier: string (nullable = true)

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|
|  I004|Refrigerat

In [0]:
# Create a new column NeedsReorder = StockQty < ReorderLevel .
from pyspark.sql.functions import col

# Boolean flag: true when the stock on hand is below the item's reorder level.
needs_reorder_flag = col("StockQty") < col("ReorderLevel")
df = df.withColumn("NeedsReorder", needs_reorder_flag)
df.show()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|        true|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|       false|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2024-02-20|    25000| FreezeIt|        true|
|  I005|     Printer|Electronics|WarehouseB|       3|           5|   2024-03-30|     8000|PrintFast|        true|
+------+------------+-----------+----------+--------+------------+-------------+--------

In [0]:
# Create a view of all items that need restocking.
# Filter before registering so the view holds only the flagged rows and its
# contents match its name. A redundant `WHERE NeedsReorder = true` in queries
# on top of this view is harmless and returns the same rows.
df.filter(col("NeedsReorder")).createOrReplaceTempView("needs_reorder")

In [0]:
# Highlight warehouses with more than 2 such items.
# Step 1: list every item currently flagged for reorder.
items_to_reorder_sql = "SELECT * FROM needs_reorder WHERE NeedsReorder = true"
spark.sql(items_to_reorder_sql).show()

# Step 2: count flagged items per warehouse and keep only warehouses with more than two.
flagged_warehouses_sql = """SELECT Warehouse, COUNT(*) AS ItemsforReorder
FROM needs_reorder
WHERE NeedsReorder = true
GROUP BY Warehouse
HAVING COUNT(*) > 2"""
spark.sql(flagged_warehouses_sql).show()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|        true|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2024-02-20|    25000| FreezeIt|        true|
|  I005|     Printer|Electronics|WarehouseB|       3|           5|   2024-03-30|     8000|PrintFast|        true|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+

+---------+---------------+
|Warehouse|ItemsforReorder|
+---------+---------------+
+---------+---------------+



Scenario 2: Supplier Price Optimization

In [0]:
# Group items by Supplier and compute average price.
# NOTE: importing `sum` here shadows Python's built-in sum() for the cells that follow.
from pyspark.sql.functions import avg, count, sum

# Average unit price per supplier, and per category (the "market" average).
supplier_avg = df.groupBy("Supplier").agg(avg("UnitPrice").alias("AvgPriceBySupplier"))
market_avg = df.groupBy("Category").agg(avg("UnitPrice").alias("AvgPriceByCategory"))

# Attach both averages to every item row: first by supplier, then by category.
sup_df = df.join(supplier_avg, "Supplier").join(market_avg, "Category")
sup_df.show()

+-----------+---------+------+------------+----------+--------+------------+-------------+---------+------------+------------------+------------------+
|   Category| Supplier|ItemID|    ItemName| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice|NeedsReorder|AvgPriceBySupplier|AvgPriceByCategory|
+-----------+---------+------+------------+----------+--------+------------+-------------+---------+------------+------------------+------------------+
|Electronics|   AVTech|  I001|      LED TV|WarehouseA|      50|          20|   2024-03-15|    30000|       false|           30000.0|           36000.0|
|Electronics|TechWorld|  I002|      Laptop|WarehouseB|      10|          15|   2024-04-01|    70000|        true|           70000.0|           36000.0|
|  Furniture|  ChairCo|  I003|Office Chair|WarehouseA|      40|          10|   2024-03-25|     6000|       false|            6000.0|            6000.0|
| Appliances| FreezeIt|  I004|Refrigerator|WarehouseC|       5|          10|   2024-02-2

In [0]:
# Find which suppliers offer items below average price in their category.
# An item is "below market" when its price is strictly under its category average.
below_category_avg = col("UnitPrice") < col("AvgPriceByCategory")
sup_df = sup_df.withColumn("BelowMarket", below_category_avg)
sup_df.show()

+-----------+---------+------+------------+----------+--------+------------+-------------+---------+------------+------------------+------------------+-----------+
|   Category| Supplier|ItemID|    ItemName| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice|NeedsReorder|AvgPriceBySupplier|AvgPriceByCategory|BelowMarket|
+-----------+---------+------+------------+----------+--------+------------+-------------+---------+------------+------------------+------------------+-----------+
|Electronics|   AVTech|  I001|      LED TV|WarehouseA|      50|          20|   2024-03-15|    30000|       false|           30000.0|           36000.0|       true|
|Electronics|TechWorld|  I002|      Laptop|WarehouseB|      10|          15|   2024-04-01|    70000|        true|           70000.0|           36000.0|      false|
|Electronics|PrintFast|  I005|     Printer|WarehouseB|       3|           5|   2024-03-30|     8000|        true|            8000.0|           36000.0|       true|
| Appliances| Fr

In [0]:
# Tag suppliers with Good Deal if >50% of their items are below market average.
# Casting the boolean to int lets us sum it: the result is the number of below-market items.
below_market_items = sum(col("BelowMarket").cast("int"))
below_market_share = (below_market_items / count("*")).alias("BelowPct")

score = sup_df.groupBy("Supplier").agg(below_market_share)
gooddeal_df = score.withColumn("GoodDeal", col("BelowPct") > 0.5)
gooddeal_df.show()

+---------+--------+--------+
| Supplier|BelowPct|GoodDeal|
+---------+--------+--------+
|   AVTech|     1.0|    true|
|TechWorld|     0.0|   false|
|PrintFast|     1.0|    true|
| FreezeIt|     0.0|   false|
|  ChairCo|     0.0|   false|
+---------+--------+--------+



Scenario 3: Cost Forecasting

In [0]:
# Calculate TotalStockValue = StockQty * UnitPrice .
stock_value = col("StockQty") * col("UnitPrice")
df = df.withColumn("TotalStockValue", stock_value)

# Roll the per-item values up to one total per warehouse.
value_by_warehouse = df.groupBy("Warehouse").agg(sum("TotalStockValue").alias("TotalStockValue"))
value_by_warehouse.show()

+----------+---------------+
| Warehouse|TotalStockValue|
+----------+---------------+
|WarehouseA|        1740000|
|WarehouseC|         125000|
|WarehouseB|         724000|
+----------+---------------+



In [0]:
# Identify top 3 highest-value items.
# Sort by stock value, largest first, and keep the first three rows.
top_value_items = df.orderBy("TotalStockValue", ascending=False).limit(3)
top_value_items.show()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|TotalStockValue|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|        1500000|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|        true|         700000|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|       false|         240000|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+



In [0]:
# Export the result as a Parquet file partitioned by Warehouse .
# "overwrite" replaces whatever an earlier run left at the target path.
(
    df.write
    .mode("overwrite")
    .partitionBy("Warehouse")
    .parquet("file:/Workspace/Shared/stocksvalue")
)

Scenario 4: Warehouse Utilization

In [0]:
# Count items stored per warehouse.
rows_per_warehouse = df.groupBy("Warehouse").count()
# Give the generic "count" column a descriptive name.
stock_counts = rows_per_warehouse.withColumnRenamed("count", "ItemCount")
stock_counts.show()

+----------+---------+
| Warehouse|ItemCount|
+----------+---------+
|WarehouseA|        2|
|WarehouseC|        1|
|WarehouseB|        2|
+----------+---------+



In [0]:
# Average stock per category in each warehouse.
avg_stock = (
    df.groupBy("Warehouse", "Category")
    .agg(avg("StockQty").alias("AvgStock"))
)
avg_stock.show()

+----------+-----------+--------+
| Warehouse|   Category|AvgStock|
+----------+-----------+--------+
|WarehouseB|Electronics|     6.5|
|WarehouseA|  Furniture|    40.0|
|WarehouseC| Appliances|     5.0|
|WarehouseA|Electronics|    50.0|
+----------+-----------+--------+



In [0]:
# Determine underutilized warehouses ( total stock < 100 ).
from pyspark.sql.functions import expr

# Total units held per warehouse.
total_stock = df.groupBy("Warehouse").agg(expr("sum(StockQty)").alias("TotalStock"))

# Put the item count next to the unit total and keep warehouses under 100 units.
underutilized = stock_counts.join(total_stock, "Warehouse").filter(col("TotalStock") < 100)
underutilized.show()

+----------+---------+----------+
| Warehouse|ItemCount|TotalStock|
+----------+---------+----------+
|WarehouseA|        2|        90|
|WarehouseC|        1|         5|
|WarehouseB|        2|        13|
+----------+---------+----------+



Scenario 5: Delta Audit Trail

In [0]:
# Save as Delta table retail_inventory .
from delta.tables import DeltaTable

(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("file:/Workspace/Shared/retail_inventory")
)

# Table handle used by the following cells for in-place updates and deletes.
delta = DeltaTable.forPath(spark, "file:/Workspace/Shared/retail_inventory")

In [0]:
# Update stock of 'Laptop' to 20.
# Values in `set` are SQL expression strings, so "20" is the literal 20.
delta.update(
    condition="ItemName='Laptop'",
    set={"StockQty": "20"},
)

In [0]:
# Delete any item with StockQty = 0 .
delta.delete(condition="StockQty = 0")

In [0]:
# Run DESCRIBE HISTORY and query VERSION AS OF previous state.
history_sql = "DESCRIBE HISTORY delta.`file:/Workspace/Shared/retail_inventory`"
spark.sql(history_sql).show()

# Time travel: read the table as of version 0, the first commit in its history.
first_version = (
    spark.read.format("delta")
    .option("versionAsOf", 0)
    .load("file:/Workspace/Shared/retail_inventory")
)
first_version.show()
     

+-------+--------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|          userId|            userName|operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      2|2025-06-19 08:44:...|4042796083082360|azuser3548_mml.lo...|   DELETE|{predicate -> ["(...|NULL|{4419187724732004}|0612-043650-nhuexwr6|          1|WriteSerializable|        false|{numRemovedFiles ...|        NULL|Databricks-Runtim...|
|      1|2025-06-19 08:4

Scenario 6: Alerts from Restock Logs (Join Task)

In [0]:

from delta.tables import DeltaTable
from pyspark.sql.functions import col, month, datediff, when, to_date, lit

# Restock log. The rename normalises a "QuantityAdded " header that carries a
# trailing space (it is a no-op when the header is already clean). The CSV is
# read without schema inference, so every column arrives as a string: parse the
# date and cast the quantity explicitly so later arithmetic stays integral.
logs = spark.read.option("header", True).csv("file:/Workspace/Shared/restock_logs.csv") \
    .withColumnRenamed("QuantityAdded ", "QuantityAdded") \
    .withColumn("RestockDate", to_date("RestockDate", "yyyy-MM-dd")) \
    .withColumn("QuantityAdded", col("QuantityAdded").cast("int"))

# Current state of the inventory Delta table.
df = spark.read.format("delta").load("file:/Workspace/Shared/retail_inventory")

# First run only: add the RestockedRecently flag, initialised to False for every
# row, and rewrite the table with the widened schema so the MERGE that follows
# has a column to update. lit(False) gives a real False even when StockQty is NULL.
if 'RestockedRecently' not in df.columns:
    df = df.withColumn("RestockedRecently", lit(False))
    df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save("file:/Workspace/Shared/retail_inventory")

# Fresh handle on the (possibly rewritten) table.
delta = DeltaTable.forPath(spark, "file:/Workspace/Shared/retail_inventory")


In [0]:
# Join with inventory table to update StockQty.
from pyspark.sql.functions import coalesce, lit

# The left join keeps items that have no restock log. For those rows
# QuantityAdded is NULL, so count it as 0; otherwise NewStockQty would be NULL
# and the MERGE that consumes `updated` would overwrite their StockQty with NULL.
# The cast keeps the sum integral even when the log column was read as a string.
# NOTE(review): assumes at most one log row per ItemID; duplicates would yield
# several source rows per item for the MERGE -- confirm against the log file.
quantity_added = coalesce(col("QuantityAdded").cast("int"), lit(0))

updated = df.alias("i").join(logs.alias("r"), "ItemID", "left") \
    .withColumn("NewStockQty", col("StockQty") + quantity_added) \
    .withColumn("RestockedRecently", col("QuantityAdded").isNotNull())

In [0]:
# Use MERGE INTO to update in Delta.
# Source side: one row per item carrying the new quantity and the restock flag.
merge_source = updated.select("ItemID", "NewStockQty", "RestockedRecently").alias("s")

# Target column -> source expression, applied to every matched row.
stock_updates = {
    "StockQty": "s.NewStockQty",
    "RestockedRecently": "s.RestockedRecently",
}

(
    delta.alias("t")
    .merge(merge_source, "t.ItemID = s.ItemID")
    .whenMatchedUpdate(set=stock_updates)
    .execute()
)

Scenario 7: Report Generation with SQL Views

In [0]:
# Create SQL view inventory_summary with:
# ItemName, Category, StockQty, NeedsReorder, TotalStockValue
# NeedsReorder is derived from the current StockQty rather than read from the
# stored column: the stored flag was computed before the table's StockQty was
# changed by the UPDATE and MERGE steps, so it can disagree with the quantity
# shown beside it. TotalStockValue is recomputed here for the same reason.
spark.sql("""CREATE OR REPLACE TEMP VIEW inventory_summary AS
SELECT ItemName, Category, StockQty,
       StockQty < ReorderLevel AS NeedsReorder,
       StockQty*UnitPrice AS TotalStockValue
FROM delta.`file:/Workspace/Shared/retail_inventory`""")

DataFrame[]

In [0]:
# Create view supplier_leaderboard sorted by average price
# One row per supplier with its mean unit price, ordered ascending by AvgPrice.
supplier_leaderboard_sql = """CREATE OR REPLACE TEMP VIEW supplier_leaderboard AS
SELECT Supplier, AVG(UnitPrice) AS AvgPrice
FROM delta.`file:/Workspace/Shared/retail_inventory`
GROUP BY Supplier
ORDER BY AvgPrice"""
spark.sql(supplier_leaderboard_sql)

DataFrame[]

Scenario 8: Advanced Filtering

In [0]:
# Use when / otherwise to categorize items:
# "Overstocked" (>2x ReorderLevel)
# "LowStock"
# Narrow the frame to the reporting columns and recompute the reorder flag
# from the quantities carried by this DataFrame.
summary_columns = ["ItemName", "Category", "StockQty", "ReorderLevel", "TotalStockValue", "LastRestocked"]
df = df.select(*summary_columns).withColumn("NeedsReorder", col("StockQty") < col("ReorderLevel"))

# NOTE(review): this re-registers `inventory_summary`, replacing any view of that
# name created earlier in the session. The feature-engineering cell further down
# reads this version (it needs LastRestocked), so the name is kept as is.
df.createOrReplaceTempView("inventory_summary")

# Branches are evaluated in order; anything that is neither overstocked nor low is "OK".
stock_status = (
    when(col("StockQty") > 2 * col("ReorderLevel"), "Overstocked")
    .when(col("StockQty") < col("ReorderLevel"), "LowStock")
    .otherwise("OK")
)
df = spark.table("inventory_summary").withColumn("StockStatus", stock_status)

In [0]:
# Use .filter() and .where() for the same and compare.
# The same predicate twice: once as a Column expression, once as a SQL string.
low_stock_condition = col("StockQty") < col("ReorderLevel")
df.filter(low_stock_condition).show()
df.where("StockQty < ReorderLevel").show()

+--------+--------+--------+------------+---------------+-------------+------------+-----------+
|ItemName|Category|StockQty|ReorderLevel|TotalStockValue|LastRestocked|NeedsReorder|StockStatus|
+--------+--------+--------+------------+---------------+-------------+------------+-----------+
+--------+--------+--------+------------+---------------+-------------+------------+-----------+

+--------+--------+--------+------------+---------------+-------------+------------+-----------+
|ItemName|Category|StockQty|ReorderLevel|TotalStockValue|LastRestocked|NeedsReorder|StockStatus|
+--------+--------+--------+------------+---------------+-------------+------------+-----------+
+--------+--------+--------+------------+---------------+-------------+------------+-----------+



Scenario 9: Feature Engineering

In [0]:
# Extract RestockMonth from LastRestocked .
# Create feature: StockAge = CURRENT_DATE - LastRestocked
# Bucket StockAge into: New, Moderate, Stale
from pyspark.sql.functions import month, datediff, current_date, when

# Whole days between today and the last restock.
days_since_restock = datediff(current_date(), col("LastRestocked"))

# Under 30 days -> New, under 90 -> Moderate, everything else -> Stale.
age_bucket = (
    when(col("StockAge") < 30, "New")
    .when(col("StockAge") < 90, "Moderate")
    .otherwise("Stale")
)

df = (
    spark.table("inventory_summary")
    .withColumn("RestockMonth", month("LastRestocked"))
    .withColumn("StockAge", days_since_restock)
    .withColumn("StockAgeBucket", age_bucket)
)
df.select("ItemName", "RestockMonth", "StockAge", "StockAgeBucket").show()

+------------+------------+--------+--------------+
|    ItemName|RestockMonth|StockAge|StockAgeBucket|
+------------+------------+--------+--------------+
|      LED TV|           3|     461|         Stale|
|Office Chair|           3|     451|         Stale|
|Refrigerator|           2|     485|         Stale|
|     Printer|           3|     446|         Stale|
|      Laptop|           4|     444|         Stale|
+------------+------------+--------+--------------+



Scenario 10: Export Options

In [0]:
# Write full DataFrame to:
# CSV for analysts
# JSON for integration
# Delta for pipelines
# Save with meaningful file and partition names like
# /export/inventory/stale_items/

# CSV, with a header row.
(
    df.write.mode("overwrite")
    .format("csv")
    .option("header", True)
    .save("file:/Workspace/Shared/export/inventory/all_items_csv")
)

# JSON.
(
    df.write.mode("overwrite")
    .format("json")
    .save("file:/Workspace/Shared/export/inventory/all_items_json")
)

# Delta.
(
    df.write.mode("overwrite")
    .format("delta")
    .save("file:/Workspace/Shared/export/inventory/all_items_delta")
)