# ERP Silver Transformation

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

# Metadata rows that drive this notebook: entries of the file-lookup table
# whose schema is "silver" and whose description starts with "ERP".
fileLookup_df = spark.read.table("inventory_project.metadata.filelookup").where(
    (col("schema") == "silver") & col("description").startswith("ERP")
)

display(fileLookup_df)

In [0]:
def clean_erp_supplier_silver(df):
    """Standardize raw ERP supplier rows for the silver layer.

    Renames the source columns to snake_case, coerces ``lead_days`` to an
    integer (values that are not digits-only fall back to 10), keeps only the
    incoterms FOB, CIF and DAP (upper-cased; anything else becomes null), and
    finally drops rows without a ``supplier_id`` and duplicate
    ``supplier_id`` rows.

    Args:
        df: Spark DataFrame holding the raw supplier extract.

    Returns:
        Spark DataFrame with at most one row per non-null ``supplier_id``.
    """
    # Source header -> silver column name. "incoterm " is matched WITH its
    # trailing space; withColumnRenamed is a no-op when the header is absent.
    column_renames = {
        "SUPP_ID": "supplier_id",
        "SuP_name": "supplier_name",
        "LeadDays": "lead_days",
        "incoterm ": "international_coterm",
        "Email": "email",
        "updtime": "updated_ts",
        "ingestion_timestamp": "ingestion_date",
    }
    renamed = df
    for source_name, silver_name in column_renames.items():
        renamed = renamed.withColumnRenamed(source_name, silver_name)

    # When the rename above matched, the raw value now lives in
    # "international_coterm" and a column called "incoterm" no longer exists,
    # so reading col("incoterm") unconditionally would fail. Read from
    # whichever column is actually present; if a plain "incoterm" column
    # exists it is still used (and kept), exactly as before.
    present = [column_name.lower() for column_name in renamed.columns]
    incoterm_source = "incoterm" if "incoterm" in present else "international_coterm"
    incoterm_upper = upper(col(incoterm_source))

    clean_po = (
        renamed
        # lead_days: digits-only strings become integers, everything else 10
        .withColumn(
            "lead_days",
            when(col("lead_days").rlike("^[0-9]+$"), col("lead_days").cast(IntegerType()))
            .otherwise(10),
        )
        # international_coterm: whitelist of accepted incoterms, else null
        .withColumn(
            "international_coterm",
            when(incoterm_upper.isin("FOB", "CIF", "DAP"), incoterm_upper)
            .otherwise(lit(None)),
        )
    )

    # A supplier row is only usable with a key; keep one row per key.
    valid_df = (
        clean_po
        .filter(col("supplier_id").isNotNull())
        .dropDuplicates(["supplier_id"])
    )
    return valid_df


In [0]:
def clean_erp_receipts_silver(df):
    """Standardize raw ERP goods-receipt rows for the silver layer.

    Steps, in order: rename the source headers, trim ``warehouse_location``,
    coerce ``quantity_received`` to int (anything that is not digits-only
    becomes 0), parse the date/timestamp columns, add a ``status_flag``
    column, then drop rows without a ``receipt_id`` and duplicate
    ``receipt_id`` rows.

    Args:
        df: Spark DataFrame holding the raw receipts extract.

    Returns:
        Spark DataFrame with at most one row per non-null ``receipt_id``.
    """
    # Source header -> silver column name (applied in this order).
    # NOTE(review): "recieved_date" is misspelled but is the emitted column
    # name; presumably downstream readers depend on it - confirm before fixing.
    header_map = {
        "RcptID": "receipt_id",
        "po_num": "po_id",
        "Prod": "product_id",
        "Recvd_QTY": "quantity_received",
        "Rec_Date": "recieved_date",
        "wh_loc": "warehouse_location",
        "updated": "updated_ts",
        "ingestion_timestamp": "ingestion_date",
    }
    receipts = df
    for source_name, silver_name in header_map.items():
        receipts = receipts.withColumnRenamed(source_name, silver_name)

    quantity = col("quantity_received")
    # (column, expression) pairs, applied one after another.
    derived_columns = [
        ("warehouse_location", trim(col("warehouse_location"))),
        (
            "quantity_received",
            when(quantity.rlike("^[0-9]+$"), quantity.cast("int")).otherwise(0),
        ),
        ("recieved_date", to_date(col("recieved_date"), "dd-MM-yyyy")),
        ("updated_ts", to_timestamp(col("updated_ts"), "dd-MM-yyyy HH:mm")),
        ("ingestion_date", to_date(col("ingestion_date"))),
        # product_id "999" is treated as the marker for an invalid product
        (
            "status_flag",
            when(col("product_id") == "999", "INVALID").otherwise("VALID"),
        ),
    ]
    for column_name, expression in derived_columns:
        receipts = receipts.withColumn(column_name, expression)

    # Keep keyed rows only, one per receipt_id.
    has_key = col("receipt_id").isNotNull()
    return receipts.filter(has_key).dropDuplicates(["receipt_id"])


In [0]:
def clean_erp_purchase_order_silver(df):
    """Standardize raw ERP purchase-order rows for the silver layer.

    Renames the source headers, converts ``qty_ordered`` to an integer
    (values that are not digits-only become null), parses the date columns,
    normalizes ``status`` to ``Open`` / ``Closed`` / ``Unknown``, adds an
    ``invalid_product`` flag, and finally drops rows without a ``po_id`` and
    duplicate ``po_id`` rows.

    Args:
        df: Spark DataFrame holding the raw purchase-order extract. It must
            contain an ``ingestion_timestamp`` column, which is read to
            derive ``ingestion_date``.

    Returns:
        Spark DataFrame with at most one row per non-null ``po_id``.
    """
    clean_po = (
        df
        # standardize column names ("PO# " and "prod_id " are matched with
        # their trailing space)
        .withColumnRenamed("PO# ", "po_id")
        .withColumnRenamed("prod_id ", "product_id")
        .withColumnRenamed("suppid", "supplier_id")
        .withColumnRenamed("QtyOrdered", "qty_ordered")
        .withColumnRenamed("Order_dt", "order_date")
        .withColumnRenamed("Expected_dt", "expected_date")
        .withColumnRenamed("Sts", "status")

        # qty_ordered: digits-only values are cast to integer, anything else
        # becomes null (when() without otherwise() yields null). Previously
        # the column was overwritten with the boolean result of rlike() and
        # that boolean was cast, so every quantity ended up as 1 or 0.
        .withColumn(
            "qty_ordered",
            when(
                col("qty_ordered").rlike("^[0-9]+$"),
                col("qty_ordered").cast(IntegerType()),
            ),
        )

        # dates: convert string -> proper date.
        # order_date is tried against three layouts; the first that parses wins.
        .withColumn(
            "order_date",
            to_date(
                coalesce(
                    expr("try_to_timestamp(order_date, 'yyyy-MM-dd')"),
                    expr("try_to_timestamp(order_date, 'yyyy/MM/dd')"),
                    expr("try_to_timestamp(order_date, 'dd-MM-yyyy')")
                )
            )
        )
        # expected_date is parsed with the single layout dd-MM-yyyy
        .withColumn("expected_date", to_date("expected_date", "dd-MM-yyyy"))
        .withColumn("ingestion_date", to_date(col("ingestion_timestamp")))

        # status: normalize to Title case (Open / Closed), else "Unknown"
        .withColumn("status", lower(col("status")))
        .withColumn("status", when(col("status") == "open", "Open")
                              .when(col("status") == "closed", "Closed")
                              .otherwise("Unknown"))

        # flag invalid product_id: only the literal "999" is treated as invalid
        .withColumn("invalid_product", when(col("product_id") == "999", lit(True)).otherwise(lit(False)))
    )
    # A purchase order is only usable with a key; keep one row per key.
    valid_df = (
        clean_po
        .filter(col("po_id").isNotNull())
        .dropDuplicates(["po_id"])
    )
    return valid_df

In [0]:
def clean_erp_products_silver(df):
    """Standardize raw ERP product rows for the silver layer.

    Renames the source headers, trims and normalizes the text columns,
    converts ``cost`` to double and the date/timestamp columns to proper
    types, then keeps only rows that have a ``product_id``, a ``sku_code``
    and a non-negative ``cost``, with duplicate ``product_id`` rows removed.

    Args:
        df: Spark DataFrame holding the raw product extract. It must contain
            an ``ingestion_timestamp`` column, which is read to derive
            ``ingestion_date``.

    Returns:
        Spark DataFrame with at most one row per ``product_id``.
    """
    # (source header, silver name) pairs; "stAtus " is matched with its
    # trailing space.
    header_pairs = [
        ("prodID", "product_id"),
        ("SKU_CODE", "sku_code"),
        ("Br@nd", "brand"),
        ("Cat", "category"),
        ("Cost$$", "cost"),
        ("stAtus ", "status"),
        ("eff_from", "effective_from"),
        ("effectivTo", "effective_to"),
        ("upd_ts", "updated_ts"),
    ]
    products = df
    for source_name, silver_name in header_pairs:
        products = products.withColumnRenamed(source_name, silver_name)

    category_upper = upper(col("category"))
    status_upper = upper(col("status"))

    # Ordered (column, expression) steps. Order matters: the trims must run
    # before the category/status mapping, and the "/" -> "-" replacement must
    # run before updated_ts is parsed.
    steps = [
        # Trim text fields
        ("brand", trim(col("brand"))),
        ("category", trim(col("category"))),
        ("status", trim(col("status"))),
        # Expand known category codes; other values pass through unchanged
        (
            "category",
            when(category_upper == 'ELEC', 'Electronics')
            .when(category_upper == 'TOYS', 'Toys')
            .otherwise(col("category")),
        ),
        # Map status by prefix; anything unrecognized becomes "Unknown"
        (
            "status",
            when(status_upper.like("ACT%"), "Active")
            .when(status_upper.like("DISC%"), "Discontinued")
            .otherwise("Unknown"),
        ),
        # Strip double quotes from cost before the numeric cast
        ("cost", regexp_replace(col("cost"), '"', "").cast("double")),
        ("effective_from", to_date(col("effective_from"), "dd-MM-yyyy")),
        ("effective_to", to_date(col("effective_to"), "dd-MM-yyyy")),
        ("updated_ts", regexp_replace("updated_ts", "/", "-")),
        ("updated_ts", expr("try_to_timestamp(updated_ts, 'yyyy-MM-dd HH:mm:ss')")),
        ("ingestion_date", to_date(col("ingestion_timestamp"))),
    ]
    for column_name, expression in steps:
        products = products.withColumn(column_name, expression)

    # Rows with a null cost are dropped too, because the comparison is null.
    is_usable = (
        col("product_id").isNotNull()
        & col("sku_code").isNotNull()
        & (col("cost") >= 0)
    )
    return products.filter(is_usable).dropDuplicates(["product_id"])

In [0]:
# Dispatch table: metadata table_name -> function that cleans that feed.
# Built once, outside the loop, because it does not change between rows.
cleaning_functions = {
    "erp_products_silver": clean_erp_products_silver,
    "erp_purchase_order_silver": clean_erp_purchase_order_silver,
    "erp_receipts_silver": clean_erp_receipts_silver,
    "erp_supplier_silver": clean_erp_supplier_silver,
}

# One iteration per metadata row: load the source parquet, clean it, report
# row counts, and write the result to the configured target table.
for row in fileLookup_df.collect():
    tableName = row['table_name']
    sourcePath = row['source_path']
    modeType = row['load_type']
    targetTable = row['target_table']

    ## Processing Starts
    print(f"Processing: {tableName}")

    # Fail with a clear message instead of "'NoneType' object is not
    # callable" when the metadata names a table with no cleaning function.
    clean_func = cleaning_functions.get(tableName)
    if clean_func is None:
        known_tables = ", ".join(cleaning_functions)
        raise ValueError(
            f"No cleaning function registered for table_name {tableName!r}; "
            f"known tables: {known_tables}"
        )

    df = spark.read.format("parquet").load(sourcePath)
    valid_df = clean_func(df)

    # "Invalid" is simply total minus kept rows, so it also counts rows that
    # were removed as duplicates.
    total_count = df.count()
    valid_count = valid_df.count()
    invalid_count = total_count - valid_count
    print(f"   📊 Total: {total_count}, ✅ Valid: {valid_count}, ❌ Invalid: {invalid_count}")

    # load_type from the metadata row is used as the Delta write mode.
    (
        valid_df.write
        .format("delta")
        .mode(modeType)
        .saveAsTable(targetTable)
    )
    # Report the table that was actually written (the target, not the key).
    print(f"   ✅ Successfully written to {targetTable}")


In [0]:
# Ad-hoc run of the products cleaner followed by an overwrite of a Delta table.
# NOTE(review): `df` is not loaded in this cell - it is whatever the loop in
# the previous cell assigned last, which is not necessarily the products
# feed. The target is also literally named "table". This looks like a
# leftover scratch cell - confirm before running it.
print("Silver Table Processing")
cleaned_df = clean_erp_products_silver(df)
display(cleaned_df)

# Row counts before and after cleaning; the difference is reported as invalid.
total_count = df.count()
valid_count = cleaned_df.count()
invalid_count = total_count - valid_count
print(f"Total: {total_count}, Valid: {valid_count}, Invalid: {invalid_count}")

# Same write as a format/mode chain, expressed through saveAsTable keywords.
cleaned_df.write.saveAsTable("table", format="delta", mode="overwrite")
