In [0]:

# Bronze-layer inputs: one Delta table per source feed.
df_bronze_pos = spark.read.format("delta").table(
    "basab_catalog_retail.bronze_tables.pos_bronze"
)
df_bronze_products = spark.read.format("delta").table(
    "basab_catalog_retail.bronze_tables.products_bronze"
)
df_bronze_inventory = spark.read.format("delta").table(
    "basab_catalog_retail.bronze_tables.inventory_bronze"
)
df_bronze_stores = spark.read.format("delta").table(
    "basab_catalog_retail.bronze_tables.stores_bronze"
)
df_bronze_weather = spark.read.format("delta").table(
    "basab_catalog_retail.bronze_tables.weather_bronze"
)
df_bronze_holiday = spark.read.format("delta").table(
    "basab_catalog_retail.bronze_tables.holiday_bronze"
)


In [0]:
%sql
-- Preview the POS bronze table.
SELECT *
FROM basab_catalog_retail.bronze_tables.pos_bronze

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Typed POS frame: the rescue column is removed and every field is cast to the
# datatype used in the silver layer.
df_silver_pos = (
    df_bronze_pos
    # Drop rescued data
    .drop("_rescued_data")

    # Standardize datatypes
    .withColumn("date", to_date(col("date"), "dd-MM-yyyy"))
    .withColumn("units_sold", col("units_sold").cast("int"))
    .withColumn("price", col("price").cast("double"))
    # Only 0/1 (also accepted as "0.0"/"1.0") are valid flags.
    # Cast through double before int: a direct string -> int cast of "1.0"
    # raises under ANSI mode, while "1.0" -> 1.0 -> 1 works in both modes and
    # yields the same 0/1 result.
    .withColumn("promo_flag",
                when(col("promo_flag").isin("0", "1", "0.0", "1.0"),
                     col("promo_flag").cast("double").cast("int"))
                .otherwise(None))   # invalid promos → null
)

In [0]:
# Preview the typed POS frame.
display(df_silver_pos)

In [0]:
# Keep only POS rows whose store_id is present and does not start with "INV".
df_silver_pos2 = df_silver_pos.where(
    col("store_id").isNotNull() & ~col("store_id").startswith("INV")
)


In [0]:
# Preview POS rows after nulls and INVALID_IDS were removed from store_id.
display(df_silver_pos2)

In [0]:
# Filling in nulls in rest of the cols with suitable value
# Per-column defaults for the nulls that remain in the other POS columns.
pos_null_defaults = {
    "sku_id": "Unknown",
    "date": "1900-01-01",
    "units_sold": 0,
    "price": 0,
    "promo_flag": 0,
}
df_silver_pos3 = df_silver_pos2.na.fill(pos_null_defaults)

In [0]:
# Preview the POS frame after null filling.
display(df_silver_pos3)

In [0]:
# Replace sku_id values starting with "INV" by the "Unknown" placeholder.
df_silver_pos4 = df_silver_pos3.withColumn(
    "sku_id",
    when(col("sku_id").startswith("INV"), "Unknown").otherwise(col("sku_id")),
)

In [0]:
# Preview the final POS frame before it is written.
display(df_silver_pos4)

In [0]:
# Persist the cleaned POS frame as the silver table.
# - No "checkpointLocation": that is a Structured Streaming option and has no
#   effect on a batch DataFrameWriter.
# - mode("overwrite") instead of "append": df_silver_pos4 is rebuilt from the
#   whole bronze table on every run, so appending would duplicate every row
#   each time the notebook is re-executed.
(
    df_silver_pos4.write
    .format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable("basab_catalog_retail.silver_tables.pos_silver")
)

In [0]:
%sql
-- Check the rows written to the POS silver table.
SELECT *
FROM basab_catalog_retail.silver_tables.pos_silver

In [0]:
# Preview the stores bronze frame.
display(df_bronze_stores)

In [0]:
# Typed stores frame: rescue column removed, size cast to int, opening date
# parsed as yyyy-MM-dd and region upper-cased.
stores_without_rescued = df_bronze_stores.drop("_rescued_data")
df_silver_stores = (
    stores_without_rescued
    .withColumn("size_sqft", col("size_sqft").cast("int"))
    .withColumn("opening_date", to_date(col("opening_date"), "yyyy-MM-dd"))
    .withColumn("region", upper(col("region")))
)



In [0]:
# Preview two rows of the typed stores frame.
display(df_silver_stores.limit(2))

In [0]:
# Keep only stores whose store_id is present and does not start with "INV".
df_silver_stores2 = df_silver_stores.where(
    col("store_id").isNotNull() & ~col("store_id").startswith("INV")
)



In [0]:
# Per-column defaults for the nulls that remain in the store attributes.
stores_null_defaults = {
    "region": "Unknown",
    "format": "Unknown",
    "size_sqft": 0,
    "opening_date": "1900-01-01",
}
df_silver_stores3 = df_silver_stores2.na.fill(stores_null_defaults)

In [0]:
# Preview the stores frame after null filling.
display(df_silver_stores3)


In [0]:
# Replace format values starting with "INV" by the "Unknown" placeholder.
df_silver_stores4 = df_silver_stores3.withColumn(
    "format",
    when(col("format").startswith("INV"), "Unknown").otherwise(col("format")),
)

In [0]:
# Persist the cleaned stores frame as the silver table.
# - No "checkpointLocation": that is a Structured Streaming option and has no
#   effect on a batch DataFrameWriter.
# - mode("overwrite") instead of "append": df_silver_stores4 is rebuilt from
#   the whole bronze table on every run, so appending would duplicate every
#   store each time the notebook is re-executed.
(
    df_silver_stores4.write
    .format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable("basab_catalog_retail.silver_tables.stores_silver")
)

In [0]:
%sql
-- Check the rows written to the stores silver table.
SELECT *
FROM basab_catalog_retail.silver_tables.stores_silver

In [0]:
# Preview the products bronze frame.
display(df_bronze_products)

In [0]:
# Keep only products whose sku_id is present and does not start with "INV".
df_silver_products = df_bronze_products.where(
    col("sku_id").isNotNull() & ~col("sku_id").startswith("INV")
)

# Fill missing descriptive attributes with the "Unknown" placeholder.
products_null_defaults = {
    "category": "Unknown",
    "subcategory": "Unknown",
    "brand": "Unknown",
}
df_silver_products2 = df_silver_products.na.fill(products_null_defaults)



In [0]:
# Replace values starting with "INV" by "Unknown" in each descriptive column,
# one column at a time.
df_silver_products3 = df_silver_products2
for attr_name in ("category", "subcategory", "brand"):
    df_silver_products3 = df_silver_products3.withColumn(
        attr_name,
        when(col(attr_name).startswith("INV"), "Unknown").otherwise(col(attr_name)),
    )


In [0]:
# Preview the cleaned products frame.
display(df_silver_products3)

In [0]:
# Continue from the cleaned products frame built from bronze.
# Do not read basab_catalog_retail.silver_tables.products_silver here: that is
# the table this notebook overwrites below, so reading it would discard the
# cleaning done in df_silver_products3 and would fail on a fresh run where the
# table does not exist yet.
df_silver_products4 = df_silver_products3

In [0]:
# Preview the products frame that will be written.
display(df_silver_products4)

In [0]:
# Remove the rescue column before writing, then preview the result.
df_silver_products4 = df_silver_products4.drop("_rescued_data")
display(df_silver_products4)

In [0]:
# Replace the products silver table with the cleaned frame.
# - No "checkpointLocation": that is a Structured Streaming option and has no
#   effect on a batch DataFrameWriter.
# - "overwriteSchema" instead of "mergeSchema": mergeSchema only adds columns,
#   so a "_rescued_data" column already present in the table would survive the
#   overwrite. overwriteSchema makes the table schema match the frame.
(
    df_silver_products4.write
    .format("delta")
    .option("overwriteSchema", "true")
    .mode("overwrite")
    .saveAsTable("basab_catalog_retail.silver_tables.products_silver")
)

In [0]:
%sql
-- Check the rows written to the products silver table.
SELECT *
FROM basab_catalog_retail.silver_tables.products_silver

In [0]:
# Print the column names and datatypes of the inventory bronze frame.
df_bronze_inventory.printSchema()

In [0]:
from pyspark.sql.functions import *
# Keep only inventory rows whose store_id is present and does not start with "INV".
df_silver_inventory = df_bronze_inventory.where(
    col("store_id").isNotNull() & ~col("store_id").startswith("INV")
)



In [0]:
# Remove the rescue column and store stock_level as an int:
# cast to double, round, then cast the result to int.
rounded_stock_level = round(col("stock_level").cast("double")).cast("int")
df_silver_inventory2 = (
    df_silver_inventory
    .drop("_rescued_data")
    .withColumn("stock_level", rounded_stock_level)
)

In [0]:
# Preview the typed inventory frame.
display(df_silver_inventory2)

In [0]:
# Replace sku_id values starting with "INV" by the "Unknown" placeholder.
df_silver_inventory3 = df_silver_inventory2.withColumn(
    "sku_id",
    when(col("sku_id").startswith("INV"), "Unknown").otherwise(col("sku_id")),
)

# Fill the nulls that remain in sku_id and stock_level.
inventory_null_defaults = {
    "sku_id": "Unknown",
    "stock_level": 0,
}
df_silver_inventory4 = df_silver_inventory3.na.fill(inventory_null_defaults)

In [0]:
# Preview the final inventory frame before it is written.
display(df_silver_inventory4)


In [0]:
# Persist the cleaned inventory frame as the silver table.
# - No "checkpointLocation": that is a Structured Streaming option and has no
#   effect on a batch DataFrameWriter.
# - mode("overwrite") instead of "append": df_silver_inventory4 is rebuilt from
#   the whole bronze table on every run, so appending would duplicate every
#   row each time the notebook is re-executed.
(
    df_silver_inventory4.write
    .format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable("basab_catalog_retail.silver_tables.inventory_silver")
)

In [0]:
%sql
-- Check the rows written to the inventory silver table.
SELECT *
FROM basab_catalog_retail.silver_tables.inventory_silver

In [0]:
# Preview the holiday bronze frame.
display(df_bronze_holiday)

In [0]:
# Parse the holiday date column using the yyyy-MM-dd pattern.
df_silver_holiday = df_bronze_holiday.withColumn(
    "date",
    to_date(col("date"), "yyyy-MM-dd"),
)


In [0]:
# Replace the holiday silver table with the parsed frame.
holiday_writer = df_silver_holiday.write.format("delta").mode("overwrite")
holiday_writer.saveAsTable("basab_catalog_retail.silver_tables.holiday_silver")


In [0]:
%sql
-- Check the rows written to the holiday silver table.
SELECT *
FROM basab_catalog_retail.silver_tables.holiday_silver

In [0]:
# Preview two rows of the weather bronze frame.
display(df_bronze_weather.limit(2))

In [0]:
# Print the column names and datatypes of the weather bronze frame.
df_bronze_weather.printSchema()

In [0]:
# Weather silver starts from the bronze frame without the rescue column.
df_silver_weather = df_bronze_weather.drop("_rescued_data")


In [0]:
# Preview the weather frame after the rescue column was dropped.
display(df_silver_weather)

In [0]:
# Keep only weather rows whose date is present and does not start with "INV".
df_silver_weather2 = df_silver_weather.where(
    col("date").isNotNull() & ~col("date").startswith("INV")
)


In [0]:
# Per-column defaults for the nulls that remain in the weather columns.
weather_null_defaults = {
    "region": "Unknown",
    "temperature_c": 0.0,
    "rainfall_mm": 0.0,
}
df_silver_weather3 = df_silver_weather2.na.fill(weather_null_defaults)

In [0]:
# Final weather frame: parse the date, store both measurements as doubles
# rounded to 2 decimals, and replace region values starting with "INV".
df_silver_weather4 = (
    df_silver_weather3
    .withColumn("date", to_date(col("date"), "yyyy-MM-dd"))
    .withColumn("temperature_c", round(col("temperature_c").cast("double"), 2))
    .withColumn("rainfall_mm", round(col("rainfall_mm").cast("double"), 2))
    .withColumn("region", when(col("region").startswith("INV"), "Unknown").otherwise(col("region")))
    # The null fill that produced df_silver_weather3 ran before these casts, so a
    # value that fails the cast to double becomes a new null. Re-apply the same
    # 0.0 defaults here so those nulls do not reach the silver table.
    # NOTE(review): assumes the bronze measurements can hold non-numeric text - confirm.
    .fillna({"temperature_c": 0.0, "rainfall_mm": 0.0})
)




In [0]:
# Preview the final weather frame before it is written.
display(df_silver_weather4)

In [0]:
# Print the column names and datatypes of the final weather frame.
df_silver_weather4.printSchema()

In [0]:
# Persist the cleaned weather frame as the silver table.
# - No "checkpointLocation": that is a Structured Streaming option and has no
#   effect on a batch DataFrameWriter.
# - mode("overwrite") instead of "append": df_silver_weather4 is rebuilt from
#   the whole bronze table on every run, so appending would duplicate every
#   row each time the notebook is re-executed.
(
    df_silver_weather4.write
    .format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable("basab_catalog_retail.silver_tables.weather_silver")
)

In [0]:
%sql
-- Check the rows written to the weather silver table.
SELECT *
FROM basab_catalog_retail.silver_tables.weather_silver