In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from math import 
 

In [0]:
def get_delta_table_data(path):
    """Read the Delta table stored at ``path`` and return the result.

    Uses the ``spark`` name that is expected to already exist in the
    notebook's global scope.

    Args:
        path: Location of the Delta table; handed unchanged to ``load``.

    Returns:
        Whatever ``spark.read.format("delta").load(path)`` returns.
    """
    delta_reader = spark.read.format("delta")
    return delta_reader.load(path)


## Load Dimension tables

In [0]:
# Root locations of the silver and gold containers.
# NOTE: the misspelled name `silver_conatiner_path` is kept on purpose,
# because a later cell reads the silver data through exactly this name.
silver_conatiner_path = "abfss://silver@carsalesdatalake04ajaz.dfs.core.windows.net"
gold_container_path = "abfss://gold@carsalesdatalake04ajaz.dfs.core.windows.net"

# Each dimension is a Delta table stored directly under the gold container.
dim_branch = get_delta_table_data(f"{gold_container_path}/branch_dimension")
dim_date = get_delta_table_data(f"{gold_container_path}/date_dimension")
dim_model = get_delta_table_data(f"{gold_container_path}/model_dimension")
dim_dealer = get_delta_table_data(f"{gold_container_path}/dealer_dimension")


## Get incremental data from silver table

In [0]:
# Fully qualified catalog name of the gold fact table.
gold_table_name = "carsales_catalog.gold.fact_car_sales"

# Declared layout of the fact table: four dimension keys followed by the
# measures. Only RevenuePerUnit is declared nullable.
fact_car_sales_schema = StructType([
    StructField("dim_model_key", LongType(), False),
    StructField("dim_branch_key", LongType(), False),
    StructField("dim_date_key", LongType(), False),
    StructField("dim_dealer_key", LongType(), False),
    StructField("Revenue", LongType(), False),
    StructField("UnitsSold", LongType(), False),
    StructField("RevenuePerUnit", DoubleType(), True),
])

# The "is_incremental" notebook widget selects the load type; any value other
# than "true" (case-insensitive) means a full load.
is_incremental = dbutils.widgets.get("is_incremental").lower() == "true"

# Default predicate keeps every silver row. It has to be a Column:
# DataFrame.filter accepts only a Column or a SQL string, so a bare Python
# `True` raises a type error on every full load / first run.
filter_condition = lit(True)
if is_incremental and spark.catalog.tableExists(gold_table_name):
    # Newest commit timestamp in the gold table's Delta history.
    # `max` here is pyspark.sql.functions.max (star import), not the builtin.
    # NOTE(review): this is the latest commit of any kind, so maintenance
    # operations presumably advance the watermark too - confirm this is
    # acceptable.
    last_load_on_gold = (
        spark.sql(f"DESCRIBE HISTORY {gold_table_name}")
        .select(max("timestamp"))
        .collect()[0][0]
    )
    # Keep only silver rows loaded after that commit.
    filter_condition = col("silver_load_timestamp") > last_load_on_gold

# Silver rows to process in this run (all rows, or only the new ones).
silver_fact_df = (
    get_delta_table_data(silver_conatiner_path + "/transformed_data")
    .filter(filter_condition)
)


## Building fact table

In [0]:
# Resolve each business id on the silver rows to its dimension surrogate key
# and keep only the keys plus the measures.
#
# Left joins keep every silver row; a row without a matching dimension entry
# gets a NULL key.
# NOTE(review): fact_car_sales_schema declares the keys as non-nullable -
# confirm unmatched rows cannot occur, or handle them explicitly.
src_fact_df = (
    silver_fact_df
    .join(dim_model, "ModelId", "left")
    .join(dim_branch, "BranchId", "left")
    .join(dim_date, "DateId", "left")
    .join(dim_dealer, "DealerId", "left")
    .select(
        dim_model["dim_model_key"],
        dim_branch["dim_branch_key"],
        dim_date["dim_date_key"],
        dim_dealer["dim_dealer_key"],
        silver_fact_df["Revenue"],
        silver_fact_df["UnitsSold"],
        # Revenue per unit rounded to 2 decimals (`round` is the pyspark
        # function from the star import). The `when` guard leaves the value
        # NULL when UnitsSold is 0 or NULL instead of dividing, so the result
        # does not depend on the session's divide-by-zero behaviour.
        # The column is named "RevenuePerUnit" to match fact_car_sales_schema.
        when(
            silver_fact_df["UnitsSold"] != 0,
            round(silver_fact_df["Revenue"] / silver_fact_df["UnitsSold"], 2),
        ).alias("RevenuePerUnit"),
    )
)

In [0]:
# Current contents of the gold fact table, or an empty DataFrame with the
# declared fact schema when the table does not exist yet. The empty frame is
# built only when it is actually needed, and the table name comes from
# `gold_table_name` rather than a repeated literal.
if spark.catalog.tableExists(gold_table_name):
    sink_fact_df = spark.table(gold_table_name)
else:
    sink_fact_df = spark.createDataFrame([], fact_car_sales_schema)

In [0]:
# Append only when this is an incremental run AND the gold table already
# exists; a full load or a first run overwrites instead.
if is_incremental and spark.catalog.tableExists(gold_table_name):
    write_mode = "append"
else:
    write_mode = "overwrite"

# Single write path: the two cases differ only in the save mode. The table is
# registered under `gold_table_name` with its data stored at the explicit
# path below the gold container.
(
    src_fact_df.write
    .format("delta")
    .mode(write_mode)
    .option("path", gold_container_path + "/car_sales_fact")
    .saveAsTable(gold_table_name)
)