In [0]:
from pyspark.sql.functions import *
from delta.tables import *

In [0]:
# Read the raw bin/lot rows from the bronze layer and preview them.
bronze_table = "inventory_project.bronze.wms_bin_lot_raw"
df = spark.read.table(bronze_table)
display(df)

In [0]:
# Strip leading/trailing whitespace from every column, keeping the column names.
# NOTE(review): trim() operates on strings, so this presumably assumes the
# bronze table is all-string — confirm against the bronze schema.
trimmed_columns = [trim(col(name)).alias(name) for name in df.columns]
df_clean = df.select(*trimmed_columns)

# bin_type: keep the recognised codes (compared and stored upper-cased);
# anything else, including NULL, becomes "Unknown".
known_bin_types = ("STG", "PICK", "BULK", "RETURNS")
bin_type_upper = upper(col("bin_type"))
df_clean = df_clean.withColumn(
    "bin_type",
    when(bin_type_upper.isin(*known_bin_types), bin_type_upper).otherwise("Unknown"),
)

# condition: case-insensitive substring match, first hit wins
# ("new" is tested before "damaged", which is tested before "return").
# Values matching none of them, including NULL, become "Unknown".
condition_lower = lower(col("condition"))
condition_label = (
    when(condition_lower.contains("new"), "New")
    .when(condition_lower.contains("damaged"), "Damaged")
    .when(condition_lower.contains("return"), "Return")
    .otherwise("Unknown")
)
df_clean = df_clean.withColumn("condition", condition_label)

# lot_expiry_date: two known malformed spellings of 31 Dec 2025 are rewritten
# to ISO form while the value is still a string, and the result is parsed once
# afterwards. Previously the CASE expression mixed string literals with a
# to_date() result, which left the column type to Spark's type coercion
# (string under the non-ANSI rules). Parsing as the final step pins the
# column to DateType, matching received_date.
# NOTE(review): if the existing silver table already stores lot_expiry_date as
# STRING, the overwrite below may need a schema migration — confirm.
expiry_raw = col("lot_expiry_date")
expiry_iso = when(
    expiry_raw.isin("2025/31/12", "31-12-2025"),
    "2025-12-31"
).otherwise(expiry_raw)

df_clean = df_clean.withColumn(
    "lot_expiry_date",
    to_date(expiry_iso, "yyyy-MM-dd")
)

# received_date: parse ISO dates. The quarantine split downstream relies on
# values that fail to parse coming back as NULL.
df_clean = df_clean.withColumn(
    "received_date",
    to_date(col("received_date"), "yyyy-MM-dd")
)

In [0]:
# 5. Split valid vs quarantine
#
# A row is rejected when either key is missing or either date is NULL.
# Keys count as missing when they are NULL *or* blank: a whitespace-only
# bin_id/lot_id is a non-NULL string, so an isNull() check alone would let it
# through into the silver table.
# Each predicate below evaluates to true/false (never NULL), so the two
# filters are exact complements and every input row lands in exactly one output.
bin_id_missing = col("bin_id").isNull() | (trim(col("bin_id")) == "")
lot_id_missing = col("lot_id").isNull() | (trim(col("lot_id")) == "")
received_date_missing = col("received_date").isNull()
lot_expiry_date_missing = col("lot_expiry_date").isNull()

is_rejected = (
    bin_id_missing
    | lot_id_missing
    | received_date_missing
    | lot_expiry_date_missing
)

df_valid = df_clean.filter(~is_rejected)

# dq_reason records only the first failing check, in the order listed here.
df_quarantine = df_clean.filter(is_rejected).withColumn(
    "dq_reason",
    when(bin_id_missing, "Missing bin_id")
    .when(lot_id_missing, "Missing lot_id")
    .when(received_date_missing, "Invalid received_date format")
    .when(lot_expiry_date_missing, "Invalid lot_expiry_date format")
    .otherwise("Unknown")
)

In [0]:
# Persist both outputs in overwrite mode: valid rows go to the silver Delta
# table, rejected rows go out as CSV under the quarantine volume path.
silver_writer = df_valid.write.format("delta").mode("overwrite")
silver_writer.saveAsTable("inventory_project.silver.wms_bin_lot")

quarantine_writer = df_quarantine.write.format("csv").mode("overwrite")
quarantine_writer.save("/Volumes/inventory_project/silver/quarantine_layer/wms_bin_lot")

# End the notebook run, passing "SUCCESS" as its exit value.
dbutils.notebook.exit("SUCCESS")