## WMS Bronze Ingestion

In [0]:
from pyspark.sql.functions import *
from datetime import datetime
from pyspark.sql import Row
from delta.tables import DeltaTable

In [0]:
def updateFileLookup(table_name, status):
    """Record the latest run status for one entry of the file-lookup table.

    Reads the lookup row(s) whose ``table_name`` matches, stamps them with
    ``status`` and the current timestamp, and merges them back into the same
    Delta table keyed on ``table_name``.

    Args:
        table_name: Value of the ``table_name`` column identifying the entry.
        status: Value to store in ``lastRunStatus``.
    """
    lookup_table = "inventory_project.metadata.filelookup"

    # Source side of the merge: the existing row(s) for this table, carrying
    # the new status and update time.
    updates = (
        spark.read.table(lookup_table)
        .filter(col("table_name") == table_name)
        .withColumn("lastRunStatus", lit(status))
        .withColumn("lastUpdatetime", current_timestamp())
    )

    # Target side of the merge, matched to the source on table_name.
    merge_builder = (
        DeltaTable.forName(spark, lookup_table)
        .alias("t")
        .merge(updates.alias("s"), "t.table_name = s.table_name")
    )

    # Upsert: matched rows only get the two status columns refreshed.
    (
        merge_builder
        .whenMatchedUpdate(set={
            "lastRunStatus": "s.lastRunStatus",
            "lastUpdatetime": "s.lastUpdatetime",
        })
        .whenNotMatchedInsertAll()
        .execute()
    )

In [0]:
def ingest_bronze_data(row):
    """Ingest one WMS source into its Bronze Delta table using Auto Loader.

    Starts an ``availableNow`` streaming query from ``source_path`` into
    ``target_table`` and blocks until that query has terminated, so that a
    failing stream raises here instead of being lost in the background.

    Args:
        row: Mapping-like record providing ``source_path``, ``source_format``,
            ``table_name`` and ``target_table``.

    Raises:
        Exception: Any error raised while starting the stream, or surfaced by
            ``awaitTermination()`` when the query terminates with a failure.
    """
    source_path = row['source_path']
    source_format = row['source_format']
    table_name = row['table_name']
    target_table = row['target_table']
    schema_path = f"/Volumes/inventory_project/bronze/wms_raw/schemas/{table_name}/schema"
    checkpoint_path = f"/Volumes/inventory_project/bronze/wms_raw/checkpoints/{table_name}/checkpoint"

    # Autoloader with schema enforcement + schema evolution
    df = (spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", source_format)
        .option("pathGlobFilter", f"*.{source_format}")             # only files with the matching extension
        .option("header", "true")
        .option("cloudFiles.schemaLocation", schema_path)          # schema tracking
        .option("cloudFiles.schemaEvolutionMode", "addNewColumns") # allow new columns
        .load(source_path)
        .withColumn("file_path", col("_metadata.file_path"))       # keep the originating file per record
        .withColumn("ingestion_time", current_timestamp())
    )

    # Write to Bronze Delta table
    query = (df.writeStream
        .format("delta")
        .option("checkpointLocation", checkpoint_path)
        .outputMode("append")
        .option("mergeSchema", "true")
        .trigger(availableNow=True)  # batch-like trigger for existing files
        .toTable(target_table))

    # toTable() only *starts* the query. Wait for the availableNow run to
    # finish so the caller's success/failure status reflects the real outcome.
    # NOTE(review): with addNewColumns, Auto Loader is documented to stop the
    # stream when it first sees new columns; that now surfaces here as an
    # exception and the table needs a re-run - confirm this is acceptable.
    query.awaitTermination()


    


In [0]:
# Driver: ingest every Bronze-layer WMS entry listed in the file-lookup table,
# recording the outcome of each one in the lookup table and in audit_rows.
fileLookup_df = (
    spark.read.table("inventory_project.metadata.filelookup")
         .filter((col("schema") == "bronze") & (col("description").startswith("WMS")))
)
fileLookup_df = fileLookup_df.collect()

audit_rows = []
for row in fileLookup_df:
    name = row['table_name']
    status = 'success'
    # Reset for every table: `error` is read in `finally`, so it must exist on
    # the success path and must not carry over a previous table's message.
    # An empty string (not None) keeps error_message a plain string column
    # when the audit DataFrame is built from these rows by schema inference.
    error = ''
    start_time = datetime.now()
    try:
        print(f"Started processing {name}")
        ingest_bronze_data(row)
    except Exception as e:
        status = 'failed'
        error = str(e)
        print(f"Error processing {name}: {e}")
        # Fail fast: the `finally` block below still runs before the
        # exception propagates and stops the loop.
        raise
    finally:
        print(f"Finished processing {name}")
        end_time = datetime.now()
        print(f"Notebook {name} took {end_time-start_time} seconds to run")
        updateFileLookup(name, status)
        audit_rows.append(Row(table_name=name,
                              start_time=start_time,
                              end_time=end_time,
                              duration=str((end_time-start_time).total_seconds()),
                              status=status,
                              error_message=error,
                              # `user` is not defined in this notebook; the
                              # audit-write cell stamps 'system' on every row.
                              created_by='system',
                              created_date=datetime.now()))

In [0]:
# Persist this run's audit entries; nothing is written when no table was processed.
if audit_rows:
    audit_df = spark.createDataFrame(audit_rows)
    # Every audit row is attributed to 'system', replacing whatever the row carried.
    audit_df = audit_df.withColumn('created_by', lit('system'))
    (
        audit_df.write
        .format("delta")
        .mode("append")
        .option("mergeSchema", "true")
        .saveAsTable("inventory_project.metadata.audit_log")
    )

In [0]:
%sql
-- Show the file-lookup metadata table, whose lastRunStatus / lastUpdatetime
-- columns are updated by updateFileLookup during the ingestion run.
select * from inventory_project.metadata.filelookup