# Retail Inventory & Supply Chain Intelligence

Creating Spark Session

In [34]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("RetailInventory").getOrCreate()
spark

#### Scenario 1: Inventory Alerting System

1. Load the data using PySpark

In [35]:
from google.colab import drive

drive.mount('/content/drive')

inventory_df = spark.read.format('csv') \
                       .option('header', True) \
                       .option('inferSchema', True) \
                       .load('/content/drive/MyDrive/Assessment/inventory_supply.csv')

inventory_df.printSchema()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
root
 |-- ItemID: string (nullable = true)
 |-- ItemName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Warehouse: string (nullable = true)
 |-- StockQty: integer (nullable = true)
 |-- ReorderLevel: integer (nullable = true)
 |-- LastRestocked: date (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Supplie: string (nullable = true)



Removing leading and tailing spaces in columns

In [36]:
inventory_df = inventory_df.toDF(*[col_name.strip() for col_name in inventory_df.columns])

Renaming column

In [37]:
inventory_df = inventory_df.withColumnRenamed('Supplie', 'Supplier')

2. Create a new column
NeedsReorder = StockQty < ReorderLevel .

In [38]:
from pyspark.sql.functions import col, when

inventory_df = inventory_df.withColumn('NeedsRecorder',
                                        when(col('StockQty') < col('ReorderLevel'), col('ReorderLevel') - col('StockQty'))
                                        .otherwise(0)
)

inventory_df.select(
                      'ItemID',
                      'ItemName',
                      'Category',
                      'Warehouse',
                      'StockQty',
                      'ReorderLevel',
                      'NeedsRecorder'
).show()

+------+------------+-----------+----------+--------+------------+-------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|NeedsRecorder|
+------+------------+-----------+----------+--------+------------+-------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|            0|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|            5|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|            0|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|            5|
|  I005|     Printer|Electronics|WarehouseB|       3|           5|            2|
+------+------------+-----------+----------+--------+------------+-------------+



3. Create a view of all items that need restocking.

In [40]:
restock_df = inventory_df.filter(col('NeedsRecorder') > 0)

restock_df.createOrReplaceTempView("items_need_restocking")

spark.sql("SELECT * FROM items_need_restocking").show()

+------+------------+-----------+----------+--------+------------+-------------+---------+----------+-------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice|  Supplier|NeedsRecorder|
+------+------------+-----------+----------+--------+------------+-------------+---------+----------+-------------+
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld |            5|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2024-02-20|    25000| FreezeIt |            5|
|  I005|     Printer|Electronics|WarehouseB|       3|           5|   2024-03-30|     8000| PrintFast|            2|
+------+------------+-----------+----------+--------+------------+-------------+---------+----------+-------------+



4. Highlight warehouses with more than 2 such items.

In [41]:
from pyspark.sql.functions import count

warehouse_counts = restock_df.groupBy('Warehouse') \
                             .agg(count('*').alias('ItemsToRestock')) \
                             .filter(col('ItemsToRestock') >= 2)
warehouse_counts.show()

+----------+--------------+
| Warehouse|ItemsToRestock|
+----------+--------------+
|WarehouseB|             2|
+----------+--------------+



#### Scenario 2: Supplier Price Optimization


1. Group items by Supplier and compute average price.

In [42]:
from pyspark.sql.functions import avg
average_price_df = inventory_df.groupBy('Supplier') \
                               .agg(avg('UnitPrice').alias('AveragePrice')) \


average_price_df.show()

+----------+------------+
|  Supplier|AveragePrice|
+----------+------------+
|  ChairCo |      6000.0|
| PrintFast|      8000.0|
|TechWorld |     70000.0|
| FreezeIt |     25000.0|
|   AVTech |     30000.0|
+----------+------------+



2. Find which suppliers offer items below average price in their category

In [43]:
avg_category_df = inventory_df.groupBy('Category') \
                              .agg(avg('UnitPrice').alias('AvgCategoryPrice'))

inventory_df = inventory_df.join(avg_category_df, on='Category', how='left')

below_avg_df = inventory_df.filter(col('UnitPrice') < col('AvgCategoryPrice')) \
                        .select(
                                  'Supplier',
                                  'ItemName',
                                  'Category',
                                  'UnitPrice',
                                  'AvgCategoryPrice'
                                )
below_avg_df.show()


+---------+--------+-----------+---------+----------------+
| Supplier|ItemName|   Category|UnitPrice|AvgCategoryPrice|
+---------+--------+-----------+---------+----------------+
|  AVTech |  LED TV|Electronics|    30000|         36000.0|
|PrintFast| Printer|Electronics|     8000|         36000.0|
+---------+--------+-----------+---------+----------------+



3. Tag suppliers with
Good Deal if >50% of their items are below market average

In [44]:
from pyspark.sql.functions import col, when, count, sum

suppliers_good_deal_df = inventory_df.withColumn('BelowAvgFlag',
                                              when(col('UnitPrice') < col('AvgCategoryPrice'), 1).otherwise(0)) \
                                  .groupBy('Supplier').agg(
                                                          count('*').alias('TotalItems'),
                                                          sum('BelowAvgFlag').alias('ItemsBelowAvg')
                                                         )

suppliers_good_deal_df = suppliers_good_deal_df.withColumn('DealTag',
                                                            when(col('ItemsBelowAvg') / col('TotalItems') > 0.5, 'Good Deal')
                                                            .otherwise('Normal')
                                                          )

suppliers_good_deal_df.select(
                                'Supplier',
                                'TotalItems',
                                'ItemsBelowAvg',
                                'DealTag'
                              ).show()


+----------+----------+-------------+---------+
|  Supplier|TotalItems|ItemsBelowAvg|  DealTag|
+----------+----------+-------------+---------+
|  ChairCo |         1|            0|   Normal|
| PrintFast|         1|            1|Good Deal|
|TechWorld |         1|            0|   Normal|
| FreezeIt |         1|            0|   Normal|
|   AVTech |         1|            1|Good Deal|
+----------+----------+-------------+---------+



#### Scenario 3: Cost Forecasting


1. Calculate
TotalStockValue = StockQty * UnitPrice

In [48]:
inventory_df = inventory_df.withColumn('TotalStockValue', col('StockQty') * col('UnitPrice'))

inventory_df.select(
                      'ItemID',
                      'ItemName',
                      'Category',
                      'Warehouse',
                      'StockQty',
                      'UnitPrice',
                      'TotalStockValue'
).show()

+------+------------+-----------+----------+--------+---------+---------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|UnitPrice|TotalStockValue|
+------+------------+-----------+----------+--------+---------+---------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|    30000|        1500000|
|  I002|      Laptop|Electronics|WarehouseB|      10|    70000|         700000|
|  I003|Office Chair|  Furniture|WarehouseA|      40|     6000|         240000|
|  I004|Refrigerator| Appliances|WarehouseC|       5|    25000|         125000|
|  I005|     Printer|Electronics|WarehouseB|       3|     8000|          24000|
+------+------------+-----------+----------+--------+---------+---------------+



2. Identify top 3 highest-value items.

In [50]:
top3_highest_value_df = inventory_df.orderBy('TotalStockValue', ascending=False) \
                                    .limit(3)

top3_highest_value_df.select(
                              'ItemID',
                              'ItemName',
                              'Category',
                              'Warehouse',
                              'StockQty',
                              'UnitPrice',
                              'TotalStockValue'
).show()

+------+------------+-----------+----------+--------+---------+---------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|UnitPrice|TotalStockValue|
+------+------------+-----------+----------+--------+---------+---------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|    30000|        1500000|
|  I002|      Laptop|Electronics|WarehouseB|      10|    70000|         700000|
|  I003|Office Chair|  Furniture|WarehouseA|      40|     6000|         240000|
+------+------------+-----------+----------+--------+---------+---------------+



3. Export the result as a Parquet file partitioned by
Warehouse

In [51]:
inventory_df.write.mode('overwrite') \
                  .partitionBy('Warehouse') \
                  .format('parquet') \
                  .save('/content/drive/MyDrive/Assessment/inventory/transformed_inventory')



In [52]:
top3_highest_value_df.write.mode('overwrite') \
                  .partitionBy('Warehouse') \
                  .format('parquet') \
                  .save('/content/drive/MyDrive/Assessment/inventory/highest_value_items')

#### Scenario 4: Warehouse Utilization

1. Count items stored per warehouse.

In [54]:
items_per_warehouse = inventory_df.groupBy('Warehouse') \
                                  .count() \
                                  .withColumnRenamed('count', 'TotalItems')

items_per_warehouse.show()

+----------+----------+
| Warehouse|TotalItems|
+----------+----------+
|WarehouseA|         2|
|WarehouseC|         1|
|WarehouseB|         2|
+----------+----------+



2. Average stock per category in each warehouse.

In [56]:
avg_stock_df = inventory_df.groupBy('Warehouse', 'Category') \
                           .agg(avg('StockQty').alias('AvgStockPerCategory'))

avg_stock_df.show()

+----------+-----------+-------------------+
| Warehouse|   Category|AvgStockPerCategory|
+----------+-----------+-------------------+
|WarehouseB|Electronics|                6.5|
|WarehouseA|  Furniture|               40.0|
|WarehouseC| Appliances|                5.0|
|WarehouseA|Electronics|               50.0|
+----------+-----------+-------------------+



3. Determine underutilized warehouses (
 total stock < 100 ).

In [58]:
under_utilized_warehouse_df = inventory_df.groupBy('Warehouse') \
                                          .agg(sum('StockQty').alias('TotalStock')) \
                                          .filter(col('TotalStock') < 100)

under_utilized_warehouse_df.show()

+----------+----------+
| Warehouse|TotalStock|
+----------+----------+
|WarehouseA|        90|
|WarehouseC|         5|
|WarehouseB|        13|
+----------+----------+



####  Scenario 5: Delta Audit Trail

1. Save as Delta table retail_inventory

In [None]:
inventory_df.write.mode('overwrite') \
               .format('delta') \
               .save('/content/drive/MyDrive/Assessment/inventory/retail_inventory')

2. Update stock of 'Laptop' to 20.

In [61]:
inventory_df = inventory_df.withColumn('StockQty',
                                        when(col('ItemName') == 'Laptop', 20).otherwise(col('StockQty')))

inventory_df.select(
                      'ItemID',
                      'ItemName',
                      'Category',
                      'Warehouse',
                      'StockQty',
                      'UnitPrice'
).show()

+------+------------+-----------+----------+--------+---------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|UnitPrice|
+------+------------+-----------+----------+--------+---------+
|  I001|      LED TV|Electronics|WarehouseA|      50|    30000|
|  I002|      Laptop|Electronics|WarehouseB|      20|    70000|
|  I003|Office Chair|  Furniture|WarehouseA|      40|     6000|
|  I004|Refrigerator| Appliances|WarehouseC|       5|    25000|
|  I005|     Printer|Electronics|WarehouseB|       3|     8000|
+------+------------+-----------+----------+--------+---------+



3. Delete any item with
StockQty = 0

In [62]:
inventory_df = inventory_df.filter(col('StockQty') != 0)
inventory_df.select(
                      'ItemID',
                      'ItemName',
                      'Category',
                      'Warehouse',
                      'StockQty',
                      'UnitPrice'
).show()

+------+------------+-----------+----------+--------+---------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|UnitPrice|
+------+------------+-----------+----------+--------+---------+
|  I001|      LED TV|Electronics|WarehouseA|      50|    30000|
|  I002|      Laptop|Electronics|WarehouseB|      20|    70000|
|  I003|Office Chair|  Furniture|WarehouseA|      40|     6000|
|  I004|Refrigerator| Appliances|WarehouseC|       5|    25000|
|  I005|     Printer|Electronics|WarehouseB|       3|     8000|
+------+------------+-----------+----------+--------+---------+



4. Run
DESCRIBE HISTORY and query
VERSION AS OF previous state.

In [None]:
history_df = spark.sql("""
                        DESCRIBE HISTORY delta.`/content/drive/MyDrive/Assessment/inventory/transformed_inventory`
                       """)

history_df.show()

previous_state_df = spark.read.format("delta") \
                              .option("versionAsOf", 0) \
                              .load('/content/drive/MyDrive/Assessment/inventory/transformed_inventory')

previous_state_df.show()

previous_state_df.createOrReplaceTempView('inventory_previous_version')

spark.sql("""
            SELECT *
            FROM inventory_previous_version
            WHERE StockQty < 10
          """).show()

####  Scenario 6: Alerts from Restock Logs

Creating Logs Dataframe

In [63]:
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType
from pyspark.sql.functions import to_date

data = [
    ('I002', '2024-04-20', 10),
    ('I005', '2024-04-22', 5),
    ('I001', '2024-04-25', 20)
]

schema = StructType([
    StructField('ItemID', StringType(), True),
    StructField('RestockDate', StringType(), True),
    StructField('QuantityAdded', IntegerType(), True)
])

restock_logs_df = spark.createDataFrame(data, schema)
restock_logs_df = restock_logs_df.withColumn('RestockDate', to_date('RestockDate', 'yyyy-MM-dd'))

restock_logs_df.printSchema()


root
 |-- ItemID: string (nullable = true)
 |-- RestockDate: date (nullable = true)
 |-- QuantityAdded: integer (nullable = true)



1.  Join with inventory table to update StockQty

In [66]:
joined_df = inventory_df.join(restock_logs_df, on='ItemID', how='left')

joined_df.show()

+------+-----------+------------+----------+--------+------------+-------------+---------+----------+-------------+----------------+---------------+-----------+-------------+
|ItemID|   Category|    ItemName| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice|  Supplier|NeedsRecorder|AvgCategoryPrice|TotalStockValue|RestockDate|QuantityAdded|
+------+-----------+------------+----------+--------+------------+-------------+---------+----------+-------------+----------------+---------------+-----------+-------------+
|  I001|Electronics|      LED TV|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech |            0|         36000.0|        1500000|       NULL|         NULL|
|  I002|Electronics|      Laptop|WarehouseB|      20|          15|   2024-04-01|    70000|TechWorld |            5|         36000.0|         700000| 2024-04-20|           10|
|  I003|  Furniture|Office Chair|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo |            0|          

In [67]:
from pyspark.sql.functions import coalesce, lit

joined_df = joined_df.withColumn('QuantityAdded',
                                  coalesce(col('QuantityAdded'), lit(0)))

joined_df.select(
                  'ItemID',
                  'ItemName',
                  'Category',
                  'Warehouse',
                  'StockQty',
                  'UnitPrice',
                  'QuantityAdded'
).show()

+------+------------+-----------+----------+--------+---------+-------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|UnitPrice|QuantityAdded|
+------+------------+-----------+----------+--------+---------+-------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|    30000|            0|
|  I002|      Laptop|Electronics|WarehouseB|      20|    70000|           10|
|  I003|Office Chair|  Furniture|WarehouseA|      40|     6000|            0|
|  I005|     Printer|Electronics|WarehouseB|       3|     8000|            5|
|  I004|Refrigerator| Appliances|WarehouseC|       5|    25000|            0|
+------+------------+-----------+----------+--------+---------+-------------+



2. Calculate new stock and flag
RestockedRecently = true for updated items.

In [71]:
from pyspark.sql.functions import col, when, lit

new_stock_df = joined_df.withColumn('NewStockQty', col('StockQty') + col('QuantityAdded')) \
                           .withColumn('RestockedRecently',
                                        when(col('QuantityAdded') > 0, lit(True)).otherwise(lit(False)))

new_stock_df.select(
                      'ItemID',
                      'ItemName',
                      'Category',
                      'Warehouse',
                      'StockQty',
                      'QuantityAdded',
                      'NewStockQty',
                      'RestockedRecently'
                    ).show()

+------+------------+-----------+----------+--------+-------------+-----------+-----------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|QuantityAdded|NewStockQty|RestockedRecently|
+------+------------+-----------+----------+--------+-------------+-----------+-----------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|            0|         50|            false|
|  I002|      Laptop|Electronics|WarehouseB|      20|           10|         30|             true|
|  I003|Office Chair|  Furniture|WarehouseA|      40|            0|         40|            false|
|  I005|     Printer|Electronics|WarehouseB|       3|            5|          8|             true|
|  I004|Refrigerator| Appliances|WarehouseC|       5|            0|          5|            false|
+------+------------+-----------+----------+--------+-------------+-----------+-----------------+



3. Use MERGE INTO to update in Delta.

In [None]:
from delta.tables import DeltaTable
from pyspark.sql.functions import expr


delta_path = '/content/drive/MyDrive/Assessment/inventory/transformed_inventory'


delta_table = DeltaTable.forPath(spark, delta_path)


delta_table.alias('i').merge(restock_logs_df.alias('r'),'i.ItemID = r.ItemID') \
                      .whenMatchedUpdate(set={
                                              'StockQty': expr('i.StockQty + r.QuantityAdded'),
                                              'LastRestocked': 'r.RestockDate'
                                              }
                                         ).execute()


#### Scenario 7: Report Generation with SQL Views

1. Create SQL view inventory_summary

In [77]:
inventory_df.createOrReplaceTempView("inventory")

spark.sql("""
            CREATE OR REPLACE TEMP VIEW inventory_summary AS
            SELECT
              ItemName,
              Category,
              StockQty,
              NeedsRecorder,
              StockQty * UnitPrice AS TotalStockValue
            FROM inventory
""")

spark.sql("SELECT * FROM inventory_summary").show()

+------------+-----------+--------+-------------+---------------+
|    ItemName|   Category|StockQty|NeedsRecorder|TotalStockValue|
+------------+-----------+--------+-------------+---------------+
|      LED TV|Electronics|      50|            0|        1500000|
|      Laptop|Electronics|      20|            5|        1400000|
|Office Chair|  Furniture|      40|            0|         240000|
|Refrigerator| Appliances|       5|            5|         125000|
|     Printer|Electronics|       3|            2|          24000|
+------------+-----------+--------+-------------+---------------+



2. create view supplier_leaderboard sorted by average price

In [79]:
spark.sql("""
          CREATE OR REPLACE TEMP VIEW supplier_leaderboard AS
          SELECT
              Supplier,
              AVG(UnitPrice) AS AvgPrice
          FROM inventory
          GROUP BY Supplier
          ORDER BY AvgPrice DESC
""")

spark.sql("SELECT * FROM supplier_leaderboard").show()

+----------+--------+
|  Supplier|AvgPrice|
+----------+--------+
|TechWorld | 70000.0|
|   AVTech | 30000.0|
| FreezeIt | 25000.0|
| PrintFast|  8000.0|
|  ChairCo |  6000.0|
+----------+--------+



#### Scenario 8: Advanced Filtering

1. Use when /otherwise to categorize items: \
 "Overstocked" (>2x ReorderLevel) \
 "LowStock"

In [80]:
inventory_df = inventory_df.withColumn('StockCategory',
                                        when(col('StockQty') > 2 * col('ReorderLevel'), 'Overstocked')
                                        .when(col('StockQty') < col('ReorderLevel'), 'LowStock')
                                        .otherwise('Normal'))

inventory_df.select(
                      'ItemID',
                      'ItemName',
                      'StockQty',
                      'ReorderLevel',
                      'StockCategory'
                    ).show()

+------+------------+--------+------------+-------------+
|ItemID|    ItemName|StockQty|ReorderLevel|StockCategory|
+------+------------+--------+------------+-------------+
|  I001|      LED TV|      50|          20|  Overstocked|
|  I002|      Laptop|      20|          15|       Normal|
|  I003|Office Chair|      40|          10|  Overstocked|
|  I004|Refrigerator|       5|          10|     LowStock|
|  I005|     Printer|       3|           5|     LowStock|
+------+------------+--------+------------+-------------+



2. Use .filter() and .where() for the same and compare.

In [82]:
filtered_df = inventory_df.filter((col("StockQty") > 2 * col("ReorderLevel")) | (col("StockQty") < col("ReorderLevel")))

filtered_df.select(
                    'ItemID',
                    'ItemName',
                    'StockQty',
                    'ReorderLevel',
                    'StockCategory'
).show()

+------+------------+--------+------------+-------------+
|ItemID|    ItemName|StockQty|ReorderLevel|StockCategory|
+------+------------+--------+------------+-------------+
|  I001|      LED TV|      50|          20|  Overstocked|
|  I003|Office Chair|      40|          10|  Overstocked|
|  I004|Refrigerator|       5|          10|     LowStock|
|  I005|     Printer|       3|           5|     LowStock|
+------+------------+--------+------------+-------------+



#### Scenario 9: Feature Engineering


1. Extract
RestockMonth from
LastRestocked .

In [84]:
from pyspark.sql.functions import month
inventory_df = inventory_df.withColumn('RestockMonth', month(col('LastRestocked')))

inventory_df.select(
                      'ItemID',
                      'ItemName',
                      'Category',
                      'Warehouse',
                      'StockQty',
                      'UnitPrice',
                      'TotalStockValue',
                      'LastRestocked',
                      'RestockMonth'
).show()

+------+------------+-----------+----------+--------+---------+---------------+-------------+------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|UnitPrice|TotalStockValue|LastRestocked|RestockMonth|
+------+------------+-----------+----------+--------+---------+---------------+-------------+------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|    30000|        1500000|   2024-03-15|           3|
|  I002|      Laptop|Electronics|WarehouseB|      20|    70000|         700000|   2024-04-01|           4|
|  I003|Office Chair|  Furniture|WarehouseA|      40|     6000|         240000|   2024-03-25|           3|
|  I004|Refrigerator| Appliances|WarehouseC|       5|    25000|         125000|   2024-02-20|           2|
|  I005|     Printer|Electronics|WarehouseB|       3|     8000|          24000|   2024-03-30|           3|
+------+------------+-----------+----------+--------+---------+---------------+-------------+------------+



2. Create feature:
StockAge = CURRENT_DATE - LastRestocked

In [87]:
from pyspark.sql.functions import current_date, datediff
inventory_df = inventory_df.withColumn('StockAge',  datediff(current_date(), col('LastRestocked')))

inventory_df.select(
                      'ItemID',
                      'ItemName',
                      'Category',
                      'Warehouse',
                      'StockQty',
                      'UnitPrice',
                      'TotalStockValue',
                      'LastRestocked',
                      'StockAge'
).show()

+------+------------+-----------+----------+--------+---------+---------------+-------------+--------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|UnitPrice|TotalStockValue|LastRestocked|StockAge|
+------+------------+-----------+----------+--------+---------+---------------+-------------+--------+
|  I001|      LED TV|Electronics|WarehouseA|      50|    30000|        1500000|   2024-03-15|     461|
|  I002|      Laptop|Electronics|WarehouseB|      20|    70000|         700000|   2024-04-01|     444|
|  I003|Office Chair|  Furniture|WarehouseA|      40|     6000|         240000|   2024-03-25|     451|
|  I004|Refrigerator| Appliances|WarehouseC|       5|    25000|         125000|   2024-02-20|     485|
|  I005|     Printer|Electronics|WarehouseB|       3|     8000|          24000|   2024-03-30|     446|
+------+------------+-----------+----------+--------+---------+---------------+-------------+--------+



3. Bucket StockAge into: New, Moderate, Stale


In [88]:
bucketed_df = inventory_df.withColumn('StockAgeBucket',
                                      when(col('StockAge') <= 30, 'New')
                                      .when(col('StockAge') <= 90, 'Moderate')
                                      .otherwise('Stale'))

bucketed_df.select(
                    'ItemID',
                    'ItemName',
                    'LastRestocked',
                    'StockAge',
                    'StockAgeBucket'
    ).show()

+------+------------+-------------+--------+--------------+
|ItemID|    ItemName|LastRestocked|StockAge|StockAgeBucket|
+------+------------+-------------+--------+--------------+
|  I001|      LED TV|   2024-03-15|     461|         Stale|
|  I002|      Laptop|   2024-04-01|     444|         Stale|
|  I003|Office Chair|   2024-03-25|     451|         Stale|
|  I004|Refrigerator|   2024-02-20|     485|         Stale|
|  I005|     Printer|   2024-03-30|     446|         Stale|
+------+------------+-------------+--------+--------------+



#### Scenario 10: Export Options

Exporting to CSV

In [90]:
joined_df.write.format('csv') \
                .mode('overwrite') \
                .save('/content/drive/MyDrive/Assessment/inventory/inventory_transformed_csv.csv')


Exporting to JSON

In [91]:
joined_df.write.format('json') \
                .mode('overwrite') \
                .save('/content/drive/MyDrive/Assessment/inventory/inventory_transformed_json.json')

Exporting as Delta

In [None]:
joined_df.write.format('delta') \
               .mode('overwrite') \
               .save('/content/drive/MyDrive/Assessment/inventory/inventory_transformed_delta')

 2. Save file with  partitioned

In [None]:
joined_df.write.format('delta') \
               .mode('overwrite') \
               .partitionBy('Category') \
               .save('/content/drive/MyDrive/Assessment/inventory/inventory_transformed_by_category')

In [None]:
joined_df.write.format('delta') \
               .mode('overwrite') \
               .partitionBy('Warehouse') \
               .save('/content/drive/MyDrive/Assessment/inventory/inventory_transformed_by_Warehouse')

Viewing Saved File

In [92]:
!ls -l /content/drive/MyDrive/Assessment/inventory/

total 16
drwx------ 4 root root 4096 Jun 19 07:07 highest_value_items
drwx------ 2 root root 4096 Jun 19 08:30 inventory_transformed_csv.csv
drwx------ 2 root root 4096 Jun 19 08:31 inventory_transformed_json.json
drwx------ 5 root root 4096 Jun 19 07:07 transformed_inventory
