## Fetching new data from silver layer

In [0]:
from pyspark.sql.functions import *  
from pyspark.sql.types import *
from delta.tables import DeltaTable

# Fully qualified name of the gold-layer car model dimension table.
gold_table_name = "carsales_catalog.gold.car_model_dimension"
# Schema of the dimension; used to build an empty stand-in DataFrame when the
# gold table does not exist yet.
gold_table_schema = StructType([
    StructField("dim_model_key", LongType(), True),
    StructField("ModelId", StringType(), False),
    StructField("ModelCategory", StringType(), False),
])

# Widget values arrive as text; only a case-insensitive "true" enables
# incremental mode.
is_incremental = dbutils.widgets.get("is_incremental").lower() == "true"

# Default to a Column that keeps every row. DataFrame.filter() accepts only a
# Column or a SQL string, so a bare Python `True` would fail on a full load.
filter_condition = lit(True)
if is_incremental and spark.catalog.tableExists(gold_table_name):
    # `max` is pyspark.sql.functions.max (star import), not the builtin.
    # The newest entry in the table history is taken as the time of the last
    # load into gold.
    last_load_on_gold = (
        spark.sql(f"DESCRIBE HISTORY {gold_table_name}")
        .select(max("timestamp"))
        .collect()[0][0]
    )
    # Only pick up silver rows written after the last gold load.
    filter_condition = col("silver_load_timestamp") > last_load_on_gold

path = "abfss://silver@carsalesdatalake04ajaz.dfs.core.windows.net/transformed_data"
silver_df = spark.read.format("delta").load(path).filter(filter_condition)

# One row per distinct ModelId; ModelCategory is the part of ModelId before
# the first "-".
car_model_src = (
    silver_df.select("ModelId")
    .withColumn("ModelCategory", split("ModelId", "-")[0])
    .distinct()
)
car_model_src.head(5)


## Creating Surrogate key for new records after segregation

### Getting existing table - car model dim 

In [0]:
# Base value added to monotonically_increasing_id() when keys are assigned to
# new rows. That function starts at 0, so this must be the first FREE key:
# 1 for a first load, otherwise one past the highest key already in gold.
max_surrogate_key = 1

if spark.catalog.tableExists(gold_table_name):
    gold_model_df = spark.table(gold_table_name)
    # `max` is pyspark.sql.functions.max (star import), not the builtin.
    current_max_key = gold_model_df.select(max("dim_model_key")).collect()[0][0]
    print(current_max_key)
    # max() returns None for an empty table; keep the default of 1 then.
    if current_max_key is not None:
        # Without the +1 the first new row (id 0) would reuse the highest
        # existing key and collide with that dimension row during the merge.
        max_surrogate_key = current_max_key + 1
else:
    # No gold table yet: use an empty frame with the target schema so the
    # joins against it still work.
    gold_model_df = spark.createDataFrame([], gold_table_schema)

### Filter new data and updated data to prepare upserted data

In [0]:

# New records: source models with no matching ModelId in the gold dimension.
# Each one gets a surrogate key derived from max_surrogate_key plus a
# generated row id.
unseen_models_df = car_model_src.join(gold_model_df, on="ModelId", how="leftanti")
dim_model_new_df = unseen_models_df.withColumn(
    "dim_model_key",
    lit(max_surrogate_key + monotonically_increasing_id()),
)

# Existing records: source models that already have a key in gold. The left
# join followed by the not-null filter keeps only the matched rows, carrying
# the source attributes together with the existing surrogate key.
matched_models_df = car_model_src.alias('src').join(
    gold_model_df.alias('sink'), on="ModelId", how="left"
)
dim_model_updated_df = (
    matched_models_df
    .filter(col("dim_model_key").isNotNull())
    .select("src.ModelId", "src.ModelCategory", "sink.dim_model_key")
)

# union() matches columns by position; both frames are laid out as
# (ModelId, ModelCategory, dim_model_key).
car_model_staging_df = dim_model_new_df.union(dim_model_updated_df)
car_model_staging_df.display()


## Implement SCD-1 by upserting changes to Delta Lake

In [0]:
# External storage location backing the gold dimension table.
adls_gold_path = "abfss://gold@carsalesdatalake04ajaz.dfs.core.windows.net/model_dimension"
if is_incremental and spark.catalog.tableExists(gold_table_name):
    # Incremental run: upsert the staged rows into the existing Delta table.
    # A catalog table is opened with DeltaTable.forName(spark, name).
    # A DeltaTable cannot be indexed like a DataFrame, so the merge condition
    # refers to both sides through aliases instead.
    sink_df = DeltaTable.forName(spark, gold_table_name)
    (
        sink_df.alias("sink")
        .merge(
            car_model_staging_df.alias("src"),
            "sink.dim_model_key = src.dim_model_key",
        )
        # SCD type 1: matched keys are overwritten in place, unmatched keys
        # are inserted as new dimension rows.
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # Full load (or first load): replace the table contents with the staged
    # rows and register the table at the external path.
    car_model_staging_df.write.format('delta')\
        .mode('overwrite')\
        .option("path", adls_gold_path)\
        .saveAsTable(gold_table_name)

## Testing

In [0]:
# Smoke check: read back a few rows of the gold dimension after the load.
gold_preview_df = spark.sql("select * from carsales_catalog.gold.car_model_dimension limit 10")
gold_preview_df.head(5)