In [0]:
from pyspark.sql.functions import *
from delta.tables import *

In [0]:
# Read the raw WMS put-away movement records from the bronze table and
# render them for a quick visual check.
bronze_source_table = "inventory_project.bronze.wms_putaway_movement_raw"
df = spark.read.table(bronze_source_table)
display(df)

In [0]:
def _trimmed_or_null(column_name):
    """Trim a string column and map a blank result to NULL.

    A value that is empty (or consists only of spaces) trims to "", which is
    not NULL and would therefore satisfy an ``isNotNull()`` check. Mapping it
    to NULL lets blank values be treated as missing.

    Args:
        column_name: Name of the column to clean.

    Returns:
        An un-aliased Column expression holding the trimmed value, or NULL
        when the trimmed value is the empty string (or the input is NULL).
    """
    trimmed = trim(col(column_name))
    # A NULL input makes the comparison NULL, so it falls through to
    # ``otherwise`` and stays NULL.
    return when(trimmed == "", lit(None)).otherwise(trimmed)


# Upper-cased, trimmed operation code from which the one-letter flag is derived.
raw_cdc_op = upper(trim(col("cdc_op")))

# Standardize the raw columns. The select keeps the original column order:
# movement_id, product_id, lot_id, serial_id, from_location, to_bin,
# movement_date, quantity, operator_id, cdc_op.
df_std = df.select(
    _trimmed_or_null("movement_id").alias("movement_id"),
    _trimmed_or_null("product_id").alias("product_id"),
    _trimmed_or_null("lot_id").alias("lot_id"),
    _trimmed_or_null("serial_id").alias("serial_id"),
    _trimmed_or_null("from_location").alias("from_location"),
    _trimmed_or_null("to_bin").alias("to_bin"),
    # Parse the cleaned text as a date using the yyyy-MM-dd pattern.
    to_date(_trimmed_or_null("movement_date"), "yyyy-MM-dd").alias("movement_date"),
    col("quantity").cast("int").alias("quantity"),
    _trimmed_or_null("operator_id").alias("operator_id"),
    # Collapse the operation code to a single letter. The checks run in the
    # order U, I, D and the first match wins.
    # NOTE(review): anything unrecognized (including NULL/blank) falls back to
    # "U", so a later `isin("I", "U", "D")` check on this column can never
    # fail. Confirm that treating unknown operations as updates is intended.
    (
        when(raw_cdc_op.contains("U"), "U")
        .when(raw_cdc_op.contains("I"), "I")
        .when(raw_cdc_op.contains("D"), "D")
        .otherwise("U")
    ).alias("cdc_op"),
)

In [0]:
# Row-level validity rules: every required field must be present, the
# quantity must be strictly positive, and the CDC operation must be one of
# I / U / D.
required_fields_present = (
    col("movement_id").isNotNull()
    & col("product_id").isNotNull()
    & col("from_location").isNotNull()
    & col("to_bin").isNotNull()
    & col("movement_date").isNotNull()
    & col("operator_id").isNotNull()
)

# `quantity > 0` evaluates to NULL when quantity is NULL, which can make the
# whole predicate NULL. A NULL predicate is dropped by filter(), and its
# negation is NULL as well, so such rows used to land in neither the valid
# nor the invalid set. Coalescing to FALSE routes them to the invalid set.
valid_condition = coalesce(
    required_fields_present
    & (col("quantity") > 0)
    & col("cdc_op").isin("I", "U", "D"),
    lit(False),
)

# Valid rows, keeping one row per (movement_id, cdc_op) pair.
# NOTE(review): which duplicate survives is not specified by dropDuplicates,
# and a movement_id can still appear once per operation type; a downstream
# MERGE keyed on movement_id alone may see several source rows for one
# target row. An ordering/sequence column would be needed to pick the latest
# change — confirm whether the source provides one.
df_valid = df_std.filter(valid_condition).dropDuplicates(["movement_id", "cdc_op"])

# Everything that fails the rules (the exact complement of the filter above).
df_invalid = df_std.filter(~valid_condition)

In [0]:
# Apply the validated change rows to the silver table.
silver_table_name = "inventory_project.silver.wms_putaway_movement"

if spark.catalog.tableExists(silver_table_name):
    # Incremental path: merge on movement_id and dispatch on the CDC flag.
    #   matched     + 'U' -> update the listed columns from the source row
    #   matched     + 'D' -> delete the target row
    #   not matched + 'I' -> insert the source row
    # Source rows that fit none of these clauses are ignored by the merge.
    delta_table = DeltaTable.forName(spark, silver_table_name)
    (
        delta_table.alias("t")
        .merge(
            df_valid.alias("s"),
            "t.movement_id = s.movement_id"
        )
        .whenMatchedUpdate(
            condition="s.cdc_op = 'U'",
            set={
                "product_id": "s.product_id",
                "lot_id": "s.lot_id",
                "serial_id": "s.serial_id",
                "from_location": "s.from_location",
                "to_bin": "s.to_bin",
                "movement_date": "s.movement_date",
                "quantity": "s.quantity",
                "operator_id": "s.operator_id",
                "cdc_op": "s.cdc_op"
            }
        )
        .whenMatchedDelete(condition="s.cdc_op = 'D'")
        .whenNotMatchedInsertAll(condition="s.cdc_op = 'I'")
        .execute()
    )
else:
    # Initial load: the table does not exist yet, so create it from the valid
    # rows. Delete events are excluded; writing them would store rows for
    # records that the source marked as removed. This mirrors the merge path,
    # where an unmatched 'D' row never produces a target row.
    # NOTE(review): 'U' rows are still written here, whereas the merge path
    # ignores unmatched updates — confirm which behaviour is wanted.
    initial_load = df_valid.filter(col("cdc_op") != "D")
    initial_load.write.format("delta").mode("overwrite").saveAsTable(silver_table_name)

# Persist the rejected rows as CSV files under the quarantine volume path,
# replacing whatever an earlier run left there, then end the notebook run
# with the exit value "SUCCESS".
quarantine_path = "/Volumes/inventory_project/silver/quarantine_layer/wms_putaway_movement"
(
    df_invalid.write
    .format("csv")
    .mode("overwrite")
    .save(quarantine_path)
)
dbutils.notebook.exit("SUCCESS")