### Fact Table for Purchase Order

In [0]:
from pyspark.sql.functions import *
from delta.tables import DeltaTable

In [0]:
# Read the silver purchase orders and the gold dimensions used to build the fact.
# load_type picks the scope: 'incremental' keeps only rows whose ingestion_date
# is at or after the latest ingestion_date in the silver table; any other value
# reads the whole table.
load_type = 'full'

if load_type != 'incremental':
  po_source = spark.read.table('inventory_project.silver.erp_purchase_order_silver')
else:
  # Single-row aggregate giving the most recent ingestion_date in silver.
  max_date_row = spark.sql('select max(ingestion_date) as max_date from inventory_project.silver.erp_purchase_order_silver').collect()[0]
  ingest_date = max_date_row['max_date']
  condition = col("ingestion_date") >= lit(ingest_date)
  po_source = spark.read.table('inventory_project.silver.erp_purchase_order_silver').where(condition)

# Dimension tables that supply the surrogate keys for the fact rows.
supp_source = spark.read.table('inventory_project.gold.erp_supplier_dim')
prod_source = spark.read.table('inventory_project.gold.erp_product_dim')

In [0]:
# Preview the purchase-order source rows selected by the load step.
# NOTE(review): `display` is not imported here; presumably the notebook
# environment's built-in renderer. Confirm.
display(po_source)

In [0]:
# Build the purchase-order fact: attach supplier and product surrogate keys to
# each source row. Left joins keep orders without a dimension match, and their
# missing keys are replaced with 'NA'.
supplier_match = col('po.supplier_id') == col('supp.supplier_id')
product_match = col('po.product_id') == col('prod.product_id')

fact_df = (
    po_source.alias('po')
    .join(supp_source.alias('supp'), on=supplier_match, how='left')
    .join(prod_source.alias('prod'), on=product_match, how='left')
    .select(
        col('po.po_id'),
        coalesce(col("supp.supplier_key"), lit('NA')).alias("supplier_key"),
        coalesce(col("prod.product_key"), lit('NA')).alias("product_key"),
        # Remaining purchase-order attributes, passed through unchanged.
        *[
            col(f'po.{name}')
            for name in ('qty_ordered', 'order_date', 'expected_date', 'status')
        ],
        # Timestamp marking when this fact row was produced.
        current_timestamp().alias('fact_ingest_date'),
    )
)

In [0]:
# Load fact_df into the gold purchase-order fact table.
#   - incremental: upsert into the existing Delta table (update matching rows,
#     insert new ones).
#   - otherwise:   overwrite the whole table with fact_df.
if load_type == 'incremental':
  target_table = DeltaTable.forName(spark, 'inventory_project.gold.erp_purchase_order_fact')
  # NOTE(review): the match key includes supplier_key and product_key, so a
  # po_id whose key value differs from the stored one (e.g. 'NA' earlier, a
  # real key now) will not match and is inserted as a new row. Confirm this
  # is the intended grain.
  target_table.alias('tgt')\
    .merge(
      fact_df.alias('src'),
      'tgt.po_id = src.po_id and tgt.supplier_key = src.supplier_key and tgt.product_key = src.product_key'
    )\
    .whenMatchedUpdateAll()\
    .whenNotMatchedInsertAll()\
    .execute()  # the merge builder only describes the operation; execute() runs it
else:
  fact_df.write.mode('overwrite').saveAsTable('inventory_project.gold.erp_purchase_order_fact')

In [0]:
# Preview the fact rows that were built for the load.
# NOTE(review): `display` is not imported here; presumably the notebook
# environment's built-in renderer. Confirm.
display(fact_df)