In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DateType
from pyspark.sql.window import Window

# ---------------------------------------------------------------------
# 0. BRONZE TABLE & SILVER TARGETS
# ---------------------------------------------------------------------

# Shared prefixes for the Silver targets, kept in one place so the PO and PR
# names cannot drift apart.
_silver_schema = "abc.abc_dw_silver"
_silver_volume_root = "/Volumes/abc/abc_dw_silver/silver_dataset/silver/procurement"

# Bronze source table read by this cell.
bronze_table = "abc.abc_dw_bronze.abc_dw_br_procurment_raw_data"

# Volume folders for the PO / PR Silver data sets.
po_silver_path = f"{_silver_volume_root}/po/"
pr_silver_path = f"{_silver_volume_root}/pr/"

# Fully qualified names of the Silver tables written at the end of the cell.
po_silver_table = f"{_silver_schema}.abc_dw_sl_procurement_po"
pr_silver_table = f"{_silver_schema}.abc_dw_sl_procurement_pr"

# ---------------------------------------------------------------------
# 1. READ BRONZE DATA
# ---------------------------------------------------------------------

# Load the Bronze table as a DataFrame.
df_bronze = spark.table(bronze_table)

# count() runs a Spark job over the table; the result is only used for the
# log line below.
bronze_row_count = df_bronze.count()
print("Bronze rows:", bronze_row_count)

# ---------------------------------------------------------------------
# 2. DATE STANDARDIZATION (YYYY/MM/DD)
#    - Internally we keep them as DateType
#    - Format YYYY/MM/DD is ensured when exporting / consuming
# ---------------------------------------------------------------------

# Columns that are parsed into dates by the conversion loop that follows.
# Keep the trailing comma on every entry: two adjacent string literals without
# a comma are silently concatenated into a single (non-existent) column name.

# PO date columns.
po_date_cols = [
    "po_createdon",
    "po_purchasingdocdate",
    "po_postingdate",
    "po_approvaldate",
    "po_deliverydate",
    "po_paymentdate",
    "po_grpostingdate",
    "po_grdocumentdate",
    "po_grentrydate",
    "po_latestgrpostingdate",
    "po_latestgrdocumentdate",
    "po_latestgrentrydate",
    "po_invpostingdate",
    "po_invdocumentdate",
    "po_inventrydate",
    "po_latestinvpostingdate",
    "po_latestinvdocumentdate",
    "po_latestinventrydate",
]

# PR date columns.
pr_date_cols = [
    "pr_creationdate",
    "pr_approveddate",
]

# Parse every listed date column that Bronze actually contains from a
# "yyyyMMdd" string into a date. try_to_date is used so that malformed
# values do not raise an error.
_date_cols_present = [
    name for name in po_date_cols + pr_date_cols if name in df_bronze.columns
]
for _date_col in _date_cols_present:
    _parsed = F.try_to_date(F.col(_date_col), "yyyyMMdd")
    # Same column name, so the original column is replaced in place.
    df_bronze = df_bronze.withColumn(_date_col, _parsed)

# ---------------------------------------------------------------------
# 3. BUILD PO SILVER DATASET
#    - Filter out deleted POs
#    - Keep PO-related columns + shared keys
# ---------------------------------------------------------------------

# Define PO-relevant columns (you can adjust if needed)
# The PO column list is assembled from contiguous groups. The group names are
# only a reading aid; the concatenation order below is the column order used
# by the select that follows.

_po_key_cols = [
    "purchaseorder",
    "po_itemnumber",
    "purchaserequisition",
    "po_serialnumber",
    "po_returnsitem",
]

_po_document_cols = [
    "po_purchasingdoctype",
    "po_purchasingdoctypedesc",
    "po_purchasingdocdate",
    "po_deletionindicator",
    "po_mastercontract",
]

_po_org_cols = [
    "po_purchasingorg",
    "po_purchasingorgdesc",
    "po_companycode",
    "po_companycodedesc",
    "po_purchasinggroup",
    "po_purchasinggroupdesc",
]

_po_vendor_cols = [
    "po_vendornumber",
    "po_vendorname",
    "po_vendoraccountgroup",
    "po_vendoraccountgroupdesc",
    "po_supplierclassification",
    "po_countrykey",
    "po_localorinternational",
]

_po_plant_agreement_cols = [
    "po_plant",
    "po_plantdesc",
    "po_agreementnumber",
    "po_cbsdocnumber",
    "po_cbsitemnumber",
    "po_storageloc",
]

_po_creation_assignment_cols = [
    "po_createdon",
    "po_createdby",
    "po_createdby_name",
    "po_documentyear",
    "po_accountassignmentcat",
    "po_accountassignmentcatdesc",
    "po_accountassignmentcatvalue",
    "po_accntassignmentcatvaluedesc",
    "po_version",
]

_po_terms_status_cols = [
    "po_incoterms1",
    "po_incoterms2",
    "po_variationorderreq",
    "po_processingstatuscode",
    "po_processingstatus",
    "po_postingdate",
    "po_approvaldate",
    "po_paymentterms",
    "po_paymenttermsdesc",
]

_material_cols = [
    "material",
    "materialdesc",
    "materialgroup",
    "materialgroupdesc",
    "materialtype",
    "materialtypedesc",
]

_po_quantity_value_cols = [
    "po_deliverydate",
    "po_unitofmeasure",
    "po_orderpriceunit",
    "po_orderquantity",
    "po_priceunit",
    "po_netprice",
    "po_currency",
    "po_netordervalue",
    "po_exchangerate",
    "po_netamount",
    "po_deliveredqty",
    "po_deliveredqtyvalue",
    "po_invoiceqty",
    "po_invoiceqtyvalue",
    "po_pendingdeliveryqty",
    "po_pendingdeliveryvalue",
    "po_pendinginvoiceqty",
    "po_pendinginvoicevalue",
    "po_deliverycompleteindicator",
]

_po_goods_receipt_cols = [
    "po_goodsreceipt",
    "po_grpostingdate",
    "po_grdocumentdate",
    "po_grentrydate",
    "po_latestgrpostingdate",
    "po_latestgrdocumentdate",
    "po_latestgrentrydate",
]

_po_payment_invoice_cols = [
    "po_paymentstatus",
    "po_paymentagainstinvoice",
    "po_advancepayment",
    "po_paymentdate",
    "po_invoicereceipt",
    "po_invpostingdate",
    "po_invdocumentdate",
    "po_inventrydate",
    "po_latestinvpostingdate",
    "po_latestinvdocumentdate",
    "po_latestinventrydate",
]

_po_ageing_status_cols = [
    "po_grnonvaluated",
    "po_approvalageing",
    "po_grageing",
    "po_headerstatus",
    "po_itemstatus",
]

_po_record_cols = [
    "recordskipindicator",
    "lastchangedatetime",
]

# Define PO-relevant columns (adjust the groups above if needed).
po_cols = [
    *_po_key_cols,
    *_po_document_cols,
    *_po_org_cols,
    *_po_vendor_cols,
    *_po_plant_agreement_cols,
    *_po_creation_assignment_cols,
    *_po_terms_status_cols,
    *_material_cols,
    *_po_quantity_value_cols,
    *_po_goods_receipt_cols,
    *_po_payment_invoice_cols,
    *_po_ageing_status_cols,
    *_po_record_cols,
]

# Keep only the requested columns that Bronze actually provides, preserving
# the order in which they were listed.
_bronze_columns = df_bronze.columns
po_cols = [name for name in po_cols if name in _bronze_columns]

# A PO row counts as "not deleted" when its deletion indicator is NULL or "".
_po_deletion_flag = F.col("po_deletionindicator")
_po_not_deleted = _po_deletion_flag.isNull() | (_po_deletion_flag == "")

# Project the PO columns, drop deleted POs, drop rows without a PO number,
# then stamp each row with the Silver load date and timestamp.
df_po = (
    df_bronze
    .select(*po_cols)
    .filter(_po_not_deleted)
    .filter(F.col("purchaseorder").isNotNull())
    .withColumn("silver_load_date", F.current_date())
    .withColumn("silver_load_timestamp", F.current_timestamp())
)

# ---------------------------------------------------------------------
# 4. BUILD PR SILVER DATASET
#    - Filter out deleted PRs
#    - Keep PR-related columns + shared keys
# ---------------------------------------------------------------------

# The PR column list is assembled from contiguous groups. The group names are
# only a reading aid; the concatenation order below is the column order used
# by the select that follows.

_pr_document_cols = [
    "purchaserequisition",
    "pr_itemnumber",
    "purchaseorder",
    "pr_documenttype",
    "pr_creationdate",
    "pr_createdby",
    "pr_createdby_name",
    "pr_requestforhiringsubcon",
]

_pr_status_cols = [
    "pr_approvalstatuscode",
    "pr_approvalstatus",
    "pr_approveddate",
    "pr_processingstatuscode",
    "pr_processingstatus",
]

_pr_org_cols = [
    "pr_companycode",
    "pr_companycodedesc",
    "pr_plant",
    "pr_plantdesc",
]

_pr_quantity_indicator_cols = [
    "pr_orderqty",
    "pr_unitofmeasure",
    "pr_closedindicator",
    "pr_deletionindicator",
]

_pr_ageing_cols = [
    "pr_openageing",
    "pr_approvalageing",
    "prtopoageing",
]

_pr_record_cols = [
    "recordskipindicator",
    "lastchangedatetime",
]

pr_cols = [
    *_pr_document_cols,
    *_pr_status_cols,
    *_pr_org_cols,
    *_pr_quantity_indicator_cols,
    *_pr_ageing_cols,
    *_pr_record_cols,
]

# Keep only the requested columns that Bronze actually provides, preserving
# the order in which they were listed.
_bronze_columns = df_bronze.columns
pr_cols = [name for name in pr_cols if name in _bronze_columns]

# A PR row counts as "not deleted" when its deletion indicator is NULL or "".
_pr_deletion_flag = F.col("pr_deletionindicator")
_pr_not_deleted = _pr_deletion_flag.isNull() | (_pr_deletion_flag == "")

# Project the PR columns, drop deleted PRs, drop rows without a PR number,
# then stamp each row with the Silver load date and timestamp.
df_pr = (
    df_bronze
    .select(*pr_cols)
    .filter(_pr_not_deleted)
    .filter(F.col("purchaserequisition").isNotNull())
    .withColumn("silver_load_date", F.current_date())
    .withColumn("silver_load_timestamp", F.current_timestamp())
)

# ---------------------------------------------------------------------
# 5. WRITE PO & PR SILVER TABLES (DELTA)
# ---------------------------------------------------------------------
# Overwrite each Silver Delta table with its freshly built DataFrame; the
# overwriteSchema option is set so the stored schema is replaced as well.
for _silver_df, _silver_table in (
    (df_po, po_silver_table),
    (df_pr, pr_silver_table),
):
    _writer = _silver_df.write.format("delta").mode("overwrite")
    _writer.option("overwriteSchema", "true").saveAsTable(_silver_table)
# ---------------------------------------------------------------------
# 6. REPORT SILVER TABLE NAMES
#    No separate registration call is made here; the tables are written
#    by the saveAsTable calls above and this section only prints names.
# ---------------------------------------------------------------------
print("PO Silver table:", po_silver_table)
print("PR Silver table:", pr_silver_table)


Bronze rows: 710
PO Silver table: abc.abc_dw_silver.abc_dw_sl_procurement_po
PR Silver table: abc.abc_dw_silver.abc_dw_sl_procurement_pr


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DateType
from pyspark.sql.window import Window

# ---------------------------------------------------------------------
# 0. BRONZE TABLE & SILVER TARGETS
# ---------------------------------------------------------------------

# Shared prefixes for the Silver targets, kept in one place so the PO and PR
# names cannot drift apart.
_silver_schema = "abc.abc_dw_silver"
_silver_volume_root = "/Volumes/abc/abc_dw_silver/silver_dataset/procurement"

# Bronze source table read by this cell.
bronze_table = "abc.abc_dw_bronze.abc_dw_br_procurment_raw_data"

# Volume folders for the PO / PR Silver data sets.
po_silver_path = f"{_silver_volume_root}/po/"
pr_silver_path = f"{_silver_volume_root}/pr/"

# Fully qualified names of the Silver tables written at the end of the cell.
po_silver_table = f"{_silver_schema}.abc_dw_sl_pur_ord"
pr_silver_table = f"{_silver_schema}.abc_dw_sl_pr_req"

# ---------------------------------------------------------------------
# 1. READ BRONZE DATA
# ---------------------------------------------------------------------

# Load the Bronze table as a DataFrame.
df_bronze = spark.table(bronze_table)

# ---------------------------------------------------------------------
# 2. TRIM ALL STRING COLUMNS
# ---------------------------------------------------------------------

# Names of the columns whose Spark type is reported as "string".
string_cols = [c for c, t in df_bronze.dtypes if t == "string"]

# Trim every string column in ONE projection. Calling withColumn once per
# column adds a projection to the plan each time, which the PySpark docs warn
# can produce very large plans; a single select yields the same columns, in
# the same order, under the same names.
_string_col_names = set(string_cols)
df_bronze = df_bronze.select(
    *[
        F.trim(F.col(name)).alias(name) if name in _string_col_names else F.col(name)
        for name in df_bronze.columns
    ]
)

# ---------------------------------------------------------------------
# 3. CONVERT YYYYMMDD STRINGS TO DateType (yyyy-MM-dd)
# ---------------------------------------------------------------------

# PO date columns, split into two contiguous groups purely for readability.
_po_general_date_cols = [
    "po_createdon",
    "po_purchasingdocdate",
    "po_postingdate",
    "po_approvaldate",
    "po_deliverydate",
    "po_paymentdate",
]
_po_invoice_date_cols = [
    "po_invpostingdate",
    "po_invdocumentdate",
    "po_inventrydate",
    "po_latestinvpostingdate",
    "po_latestinvdocumentdate",
    "po_latestinventrydate",
]
po_date_cols = _po_general_date_cols + _po_invoice_date_cols

# PR date columns.
pr_date_cols = ["pr_creationdate", "pr_approveddate"]

# Every column handled by the date conversion step, PO columns first.
date_cols = [*po_date_cols, *pr_date_cols]

# Typed NULL used for inputs that carry no value.
_null_date = F.lit(None).cast(DateType())

# Convert each listed column that Bronze contains. NULL and empty-string
# inputs are mapped to a NULL date explicitly; every other value goes through
# try_to_date with the "yyyyMMdd" pattern (safe parsing, no error raised on
# malformed values).
for _date_col in [name for name in date_cols if name in df_bronze.columns]:
    _raw_value = F.col(_date_col)
    _has_value = _raw_value.isNotNull() & (_raw_value != "")
    df_bronze = df_bronze.withColumn(
        _date_col,
        F.when(_has_value, F.try_to_date(_raw_value, "yyyyMMdd")).otherwise(_null_date),
    )

# NOTE: lastchangedatetime is not in date_cols, so this step leaves it
# untouched. po_createdon IS listed in po_date_cols and is converted above.
# If the exact pattern of lastchangedatetime is known, it can be parsed later
# with to_timestamp.

# ---------------------------------------------------------------------
# 4. BUILD PO SILVER DATASET (FILTER NON-DELETED PO)
# ---------------------------------------------------------------------

# Columns carried into the PO Silver table, in output order.
# An entry is either a plain Bronze column name (kept under the same name) or
# a (bronze_name, silver_name) pair for a column that is renamed on the way in.
po_cols = [
    "purchaseorder",
    "po_itemnumber",
    ("purchaserequisition", "po_purchaserequisition"),
    "po_serialnumber",
    "po_returnsitem",
    "po_purchasingdoctype",
    "po_purchasingdoctypedesc",
    "po_purchasingdocdate",
    "po_deletionindicator",
    "po_mastercontract",
    "po_purchasingorg",
    "po_purchasingorgdesc",
    "po_companycode",
    "po_companycodedesc",
    "po_purchasinggroup",
    "po_purchasinggroupdesc",
    "po_vendornumber",
    "po_vendorname",
    "po_vendoraccountgroup",
    "po_vendoraccountgroupdesc",
    "po_supplierclassification",
    "po_countrykey",
    "po_localorinternational",
    "po_plant",
    "po_plantdesc",
    "po_storageloc",
    "po_createdon",
    "po_createdby",
    "po_createdby_name",
    "po_documentyear",
    "po_accountassignmentcat",
    "po_accountassignmentcatdesc",
    "po_accountassignmentcatvalue",
    "po_accntassignmentcatvaluedesc",
    "po_version",
    "po_processingstatuscode",
    "po_processingstatus",
    "po_postingdate",
    "po_approvaldate",
    "po_paymentterms",
    "po_paymenttermsdesc",
    "material",
    "materialdesc",
    "materialgroup",
    "materialgroupdesc",
    "materialtype",
    "materialtypedesc",
    "po_deliverydate",
    "po_unitofmeasure",
    "po_orderpriceunit",
    "po_orderquantity",
    "po_priceunit",
    "po_netprice",
    "po_currency",
    "po_netordervalue",
    "po_exchangerate",
    "po_netamount",
    "po_deliveredqty",
    "po_deliveredqtyvalue",
    "po_pendingdeliveryqty",
    "po_pendingdeliveryvalue",
    "po_pendinginvoiceqty",
    "po_pendinginvoicevalue",
    "po_deliverycompleteindicator",
    "po_paymentstatus",
    "po_paymentagainstinvoice",
    "po_advancepayment",
    "po_paymentdate",
    "po_headerstatus",
    "po_itemstatus",
    "recordskipindicator",
    ("lastchangedatetime", "po_lastchangedatetime"),
]

# Resolve each entry against Bronze. The existence check must look at the
# SOURCE name: testing a (source, alias) tuple directly against the list of
# column names never matches, which would silently drop the renamed columns.
_bronze_columns = set(df_bronze.columns)
_po_cols_present = []
_po_select = []
for _spec in po_cols:
    _source, _target = _spec if isinstance(_spec, tuple) else (_spec, _spec)
    if _source not in _bronze_columns:
        # Column not delivered by Bronze: skip it (safe subset).
        continue
    _po_cols_present.append(_spec)
    _column = F.col(_source)
    _po_select.append(_column if _target == _source else _column.alias(_target))

# po_cols keeps its original meaning: the requested entries found in Bronze.
po_cols = _po_cols_present

df_po = df_bronze.select(*_po_select)

# Filter out deleted POs (keep null/empty deletion indicator)
df_po = df_po.filter(
    (F.col("po_deletionindicator").isNull()) | (F.col("po_deletionindicator") == "")
)

# Keep only rows with an actual PO number
df_po = df_po.filter(F.col("purchaseorder").isNotNull())

# Add Silver load metadata
df_po = df_po.withColumn("silver_load_date", F.current_date()) \
             .withColumn("silver_load_timestamp", F.current_timestamp())

# ---------------------------------------------------------------------
# 5. BUILD PR SILVER DATASET (FILTER NON-DELETED PR)
# ---------------------------------------------------------------------

# Columns carried into the PR Silver table, in output order.
# An entry is either a plain Bronze column name (kept under the same name) or
# a (bronze_name, silver_name) pair for a column that is renamed on the way in.
pr_cols = [
    ("purchaserequisition", "pr_purchaserequisition"),
    "pr_itemnumber",
    "pr_documenttype",
    "pr_creationdate",
    "pr_createdby",
    "pr_createdby_name",
    "pr_requestforhiringsubcon",
    "pr_approvalstatuscode",
    "pr_approvalstatus",
    "pr_approveddate",
    "pr_processingstatuscode",
    "pr_processingstatus",
    "pr_companycode",
    "pr_companycodedesc",
    "pr_plant",
    "pr_plantdesc",
    "pr_orderqty",
    "pr_unitofmeasure",
    "pr_closedindicator",
    "pr_deletionindicator",
    "pr_openageing",
    "pr_approvalageing",
    "prtopoageing",
    "recordskipindicator",
    ("lastchangedatetime", "pr_lastchangedatetime"),
]

# Resolve each entry against Bronze. The existence check must look at the
# SOURCE name: testing a (source, alias) tuple directly against the list of
# column names never matches, which would silently drop the renamed columns.
_bronze_columns = set(df_bronze.columns)
_pr_cols_present = []
_pr_select = []
for _spec in pr_cols:
    _source, _target = _spec if isinstance(_spec, tuple) else (_spec, _spec)
    if _source not in _bronze_columns:
        # Column not delivered by Bronze: skip it (safe subset).
        continue
    _pr_cols_present.append(_spec)
    _column = F.col(_source)
    _pr_select.append(_column if _target == _source else _column.alias(_target))

# pr_cols keeps its original meaning: the requested entries found in Bronze.
pr_cols = _pr_cols_present

df_pr = df_bronze.select(*_pr_select)

# Filter out deleted PR (keep where deletion indicator is null or empty)
df_pr = df_pr.filter(
    (F.col("pr_deletionindicator").isNull()) | (F.col("pr_deletionindicator") == "")
)

# Filter out rows with no PR number at all. The PR number is addressed by its
# Silver name because the projection above has already renamed it.
df_pr = df_pr.filter(F.col("pr_purchaserequisition").isNotNull())

# Add Silver metadata
df_pr = (
    df_pr
    .withColumn("silver_load_date", F.current_date())
    .withColumn("silver_load_timestamp", F.current_timestamp())
)

# ---------------------------------------------------------------------
# 6. WRITE PO & PR SILVER TABLES (DELTA)
# ---------------------------------------------------------------------
# Overwrite each Silver Delta table with its freshly built DataFrame; the
# overwriteSchema option is set so the stored schema is replaced as well.
for _silver_df, _silver_table in (
    (df_po, po_silver_table),
    (df_pr, pr_silver_table),
):
    _writer = _silver_df.write.format("delta").mode("overwrite")
    _writer.option("overwriteSchema", "true").saveAsTable(_silver_table)
# ---------------------------------------------------------------------
# No separate registration step follows: the tables are written by the
# saveAsTable calls above.
# ---------------------------------------------------------------------


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import DateType

# ---------------------------------------------------------------------
# 0. BRONZE SOURCE & SILVER TARGETS
# ---------------------------------------------------------------------

# Shared schema prefix for both Silver targets.
_silver_schema = "abc.abc_dw_silver"

# Bronze source table read by this cell.
bronze_table = "abc.abc_dw_bronze.abc_dw_br_procurment_raw_data"

# Fully qualified names of the Silver tables written at the end of the cell.
po_silver_table = f"{_silver_schema}.abc_dw_sl_pur_ord"
pr_silver_table = f"{_silver_schema}.abc_dw_sl_pr_req"

# ---------------------------------------------------------------------
# 1. READ BRONZE
# ---------------------------------------------------------------------

# Load the Bronze table as a DataFrame.
df = spark.table(bronze_table)

# ---------------------------------------------------------------------
# 2. TRIM ALL STRING COLUMNS
# ---------------------------------------------------------------------

# Trim every column whose Spark type is reported as "string" in ONE
# projection. Calling withColumn once per column adds a projection to the plan
# each time, which the PySpark docs warn can produce very large plans; a
# single select yields the same columns, in the same order, under the same
# names.
df = df.select(
    *[
        F.trim(F.col(name)).alias(name) if type_name == "string" else F.col(name)
        for name, type_name in df.dtypes
    ]
)

# ---------------------------------------------------------------------
# 3. DATE CONVERSION (YYYYMMDD → yyyy-MM-dd)
# ---------------------------------------------------------------------

# PO date columns: general document dates first, then the invoice dates.
po_date_cols = []
po_date_cols.extend(
    [
        "po_createdon",
        "po_purchasingdocdate",
        "po_postingdate",
        "po_approvaldate",
        "po_deliverydate",
        "po_paymentdate",
    ]
)
po_date_cols.extend(
    [
        "po_invpostingdate",
        "po_invdocumentdate",
        "po_inventrydate",
        "po_latestinvpostingdate",
        "po_latestinvdocumentdate",
        "po_latestinventrydate",
    ]
)

# PR date columns.
pr_date_cols = ["pr_creationdate", "pr_approveddate"]

# Every column handled by the date conversion step, PO columns first.
date_cols = list(po_date_cols)
date_cols.extend(pr_date_cols)
# Convert each listed column that the DataFrame contains. NULL and
# empty-string inputs become a NULL date explicitly; every other value is
# parsed by try_to_date with the "yyyyMMdd" pattern.
_available_columns = df.columns
for _date_col in date_cols:
    if _date_col not in _available_columns:
        continue
    _raw_value = F.col(_date_col)
    _is_blank = _raw_value.isNull() | (_raw_value == "")
    _parsed = F.when(~_is_blank, F.try_to_date(_raw_value, "yyyyMMdd"))
    # Same column name, so the original column is replaced in place.
    df = df.withColumn(_date_col, _parsed.otherwise(F.lit(None).cast(DateType())))

# ---------------------------------------------------------------------
# 4. PO SILVER — RENAME COLUMNS + FILTER NON-DELETED PO
# ---------------------------------------------------------------------

# Columns that receive a different name in the PO Silver table
# (source name -> Silver name).
_po_renames = {
    "purchaserequisition": "po_purchaserequisition",
    "lastchangedatetime": "po_lastchangedatetime",
}

# Source columns of the PO Silver table, in output order.
_po_source_cols = [
    "purchaseorder",
    "po_itemnumber",
    "purchaserequisition",
    "po_serialnumber",
    "po_returnsitem",
    "po_purchasingdoctype",
    "po_purchasingdoctypedesc",
    "po_purchasingdocdate",
    "po_deletionindicator",
    "po_purchasingorg",
    "po_purchasingorgdesc",
    "po_companycode",
    "po_companycodedesc",
    "po_purchasinggroup",
    "po_purchasinggroupdesc",
    "po_vendornumber",
    "po_vendorname",
    "po_vendoraccountgroup",
    "po_vendoraccountgroupdesc",
    "po_supplierclassification",
    "po_countrykey",
    "po_localorinternational",
    "po_plant",
    "po_plantdesc",
    "po_storageloc",
    "po_createdon",
    "po_createdby_name",
    "po_documentyear",
    "po_accountassignmentcat",
    "po_accountassignmentcatdesc",
    "po_accntassignmentcatvaluedesc",
    "po_version",
    "po_processingstatuscode",
    "po_processingstatus",
    "po_postingdate",
    "po_approvaldate",
    "po_deliverydate",
    "po_paymentdate",
    "material",
    "materialdesc",
    "materialgroup",
    "materialgroupdesc",
    "materialtype",
    "materialtypedesc",
    "po_orderquantity",
    "po_unitofmeasure",
    "po_netamount",
    "recordskipindicator",
    "lastchangedatetime",
]

# One Column expression per source column, aliased where a rename is defined.
po_select = [
    F.col(name).alias(_po_renames[name]) if name in _po_renames else F.col(name)
    for name in _po_source_cols
]

# A PO row counts as "not deleted" when its deletion indicator is NULL or "".
_po_deletion_flag = F.col("po_deletionindicator")
_po_not_deleted = _po_deletion_flag.isNull() | (_po_deletion_flag == "")

# Project, drop deleted POs, drop rows without a PO number, then stamp each
# row with the Silver load date and timestamp.
df_po = (
    df.select(*po_select)
    .filter(_po_not_deleted)
    .filter(F.col("purchaseorder").isNotNull())
    .withColumn("silver_load_date", F.current_date())
    .withColumn("silver_load_timestamp", F.current_timestamp())
)

# ---------------------------------------------------------------------
# 5. PR SILVER — RENAME COLUMNS + FILTER NON-DELETED PR
# ---------------------------------------------------------------------

# Columns that receive a different name in the PR Silver table
# (source name -> Silver name).
_pr_renames = {
    "purchaserequisition": "pr_purchaserequisition",
    "lastchangedatetime": "pr_lastchangedatetime",
}

# Source columns of the PR Silver table, in output order.
_pr_source_cols = [
    "purchaserequisition",
    "pr_itemnumber",
    "pr_documenttype",
    "pr_creationdate",
    "pr_createdby_name",
    "pr_approvalstatus",
    "pr_approveddate",
    "pr_processingstatus",
    "pr_companycode",
    "pr_companycodedesc",
    "pr_plant",
    "pr_plantdesc",
    "pr_orderqty",
    "pr_unitofmeasure",
    "pr_closedindicator",
    "pr_deletionindicator",
    "recordskipindicator",
    "lastchangedatetime",
]

# One Column expression per source column, aliased where a rename is defined.
pr_select = [
    F.col(name).alias(_pr_renames[name]) if name in _pr_renames else F.col(name)
    for name in _pr_source_cols
]

# A PR row counts as "not deleted" when its deletion indicator is NULL or "".
_pr_deletion_flag = F.col("pr_deletionindicator")
_pr_not_deleted = _pr_deletion_flag.isNull() | (_pr_deletion_flag == "")

# Project, drop deleted PRs, drop rows without a PR number (addressed by its
# Silver name), then stamp each row with the Silver load date and timestamp.
df_pr = (
    df.select(*pr_select)
    .filter(_pr_not_deleted)
    .filter(F.col("pr_purchaserequisition").isNotNull())
    .withColumn("silver_load_date", F.current_date())
    .withColumn("silver_load_timestamp", F.current_timestamp())
)

# ---------------------------------------------------------------------
# 6. WRITE SILVER TABLES
# ---------------------------------------------------------------------

# Overwrite each Silver Delta table with its freshly built DataFrame; the
# overwriteSchema option is set so the stored schema is replaced as well.
for _silver_df, _silver_table in (
    (df_po, po_silver_table),
    (df_pr, pr_silver_table),
):
    _writer = _silver_df.write.format("delta").mode("overwrite")
    _writer.option("overwriteSchema", "true").saveAsTable(_silver_table)

# Printed only after both writes have returned.
print("Silver Layer Loaded Successfully.")


Silver Layer Loaded Successfully.
