In [1]:
%run NB - Data Producer with Null

StatementMeta(, e6f462e3-cd83-4c27-96e5-0e13b1c8db7d, 6, Finished, Available, Finished)

Collecting faker
  Downloading faker-37.5.3-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.5.3-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.5.3
Note: you may need to restart the kernel to use updated packages.


SynapseWidget(Synapse.DataFrame, ea5f9976-0fa8-4789-85aa-a999d51db56f)

In [2]:
from pyspark.sql.functions import col, lit, lag, when, sha2, concat_ws, lead, current_date
from pyspark.sql.window import Window

StatementMeta(, e6f462e3-cd83-4c27-96e5-0e13b1c8db7d, 7, Finished, Available, Finished)

In [3]:
# Backfill LoadDate: rows where it is NULL receive today's date
# (current_date()); rows that already carry a value keep it unchanged.
load_date_or_today = (
    when(col("LoadDate").isNull(), current_date())
    .otherwise(col("LoadDate"))
)
df_null = df_null.withColumn("LoadDate", load_date_or_today)

display(df_null)

StatementMeta(, e6f462e3-cd83-4c27-96e5-0e13b1c8db7d, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 5d42478b-c945-49dc-af22-0e44776ad7f3)

In [4]:
# Collapse repeated records: rows that agree on every column listed below are
# reduced to a single row. Columns not in the list are not compared.
dedup_columns = ["EmpID", "LoadDate", "Name", "JobTitle", "Department"]
df_null = df_null.dropDuplicates(dedup_columns)
display(df_null)

StatementMeta(, e6f462e3-cd83-4c27-96e5-0e13b1c8db7d, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 1b7b2f81-50c4-4887-9a4a-d98f1b71d453)

In [5]:
# Create a hash of the tracking columns (Name, JobTitle, Department).
#
# concat_ws() silently skips NULL inputs, so hashing the raw columns lets
# genuinely different rows collide: ("Ann", NULL, "Sales") and
# ("Ann", "Sales", NULL) would both be hashed as "Ann||Sales" and the change
# between them would go undetected. Each value is therefore tagged before it
# is joined: NULL becomes "N", a present value becomes "V" + value. A NULL
# now always occupies its own position and can never equal a real value.
#
# The hash is only compared against other hashes produced by this same cell,
# so the change in encoding does not affect anything outside the notebook run.
tracked_columns = ["Name", "JobTitle", "Department"]
null_safe_parts = [
    when(col(column_name).isNull(), lit("N"))
    .otherwise(concat_ws("", lit("V"), col(column_name)))
    for column_name in tracked_columns
]
df_hashed = df_null.withColumn(
    "row_sha2",
    sha2(concat_ws("||", *null_safe_parts), 256)
)
display(df_hashed)

StatementMeta(, e6f462e3-cd83-4c27-96e5-0e13b1c8db7d, 10, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 6cb860ad-1d7f-4bbe-b247-f2f6730ea04c)

In [6]:
# One window per EmpID, rows ordered by LoadDate; lag() reads the hash of the
# preceding row inside that window (NULL for the first row of each EmpID).
# NOTE(review): rows sharing the same EmpID and LoadDate have no defined
# relative order under this window -- confirm such ties cannot occur.
window_spec = Window.partitionBy("EmpID").orderBy("LoadDate")

# change_flag is 1 when the row has no predecessor or its hash differs from
# the predecessor's hash, and 0 otherwise.
df_with_lag = (
    df_hashed
    .withColumn("prev_hash", lag("row_sha2").over(window_spec))
    .withColumn(
        "change_flag",
        when(col("prev_hash").isNull(), lit(1))
        .when(col("row_sha2") != col("prev_hash"), lit(1))
        .otherwise(lit(0)),
    )
)

display(df_with_lag)

StatementMeta(, e6f462e3-cd83-4c27-96e5-0e13b1c8db7d, 11, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 1bb596f2-f578-49d2-bdd4-0e53054335e5)

In [7]:
# Keep the rows flagged as changes, then discard the two helper columns that
# were only needed to compute the flag.
df_changes = (
    df_with_lag
    .where(col("change_flag") == 1)
    .drop("prev_hash", "change_flag")
)
display(df_changes)

StatementMeta(, e6f462e3-cd83-4c27-96e5-0e13b1c8db7d, 12, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a5665fec-0428-4cea-af0b-2d68899760da)

In [8]:
# StartDate: each retained row starts on its own LoadDate (straight copy).
df_changes = df_changes.withColumn("StartDate", df_changes["LoadDate"])
display(df_changes)

StatementMeta(, e6f462e3-cd83-4c27-96e5-0e13b1c8db7d, 13, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, fba7310f-8b78-42d2-a5cb-32af768fb54f)

In [9]:
# EndDate: the StartDate of the next row for the same EmpID (ordered by
# StartDate). The last row of each EmpID has no successor, so lead() yields NULL.
window_spec_ver = Window.partitionBy("EmpID").orderBy("StartDate")
df_changes = df_changes.withColumn(
    "EndDate", lead(col("StartDate"), 1).over(window_spec_ver)
)
display(df_changes)

StatementMeta(, e6f462e3-cd83-4c27-96e5-0e13b1c8db7d, 14, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 926917dc-02a9-4b4e-8410-d29ee5629fd2)

In [10]:
# IsActive is True exactly when EndDate is NULL, and False otherwise.
df_changes = df_changes.withColumn("IsActive", col("EndDate").isNull())
display(df_changes)

StatementMeta(, e6f462e3-cd83-4c27-96e5-0e13b1c8db7d, 15, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, c81496fa-8214-4430-8540-599645f788bf)

In [11]:
# Final SCD2 table: project the business columns plus the validity columns,
# in this order; every other column on df_changes is left out.
scd2_columns = [
    "EmpID",
    "Name",
    "Gender",
    "JobTitle",
    "Department",
    "StartDate",
    "EndDate",
    "IsActive",
]
scd2_final = df_changes.select(*scd2_columns)
display(scd2_final)

StatementMeta(, e6f462e3-cd83-4c27-96e5-0e13b1c8db7d, 16, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 5445a1fc-1293-4b95-a653-15ec49aecfe5)

In [12]:
# Keep only the rows whose IsActive flag is true.
# IsActive is a boolean column, so it is used directly as the predicate rather
# than compared with the string "true", which only worked through an implicit
# string-to-boolean cast.
df = scd2_final.filter(col("IsActive"))
display(df)

StatementMeta(, e6f462e3-cd83-4c27-96e5-0e13b1c8db7d, 17, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 19ebd571-fddc-4299-9ff5-d71dffb9b751)