In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, current_timestamp, lit
from pyspark.sql.types import (
    BooleanType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)
from pyspark.sql.utils import AnalysisException

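# Spark session: none is created in this notebook; `spark` is assumed to be
# provided by the notebook runtime (e.g. Databricks) -- TODO confirm.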

# Layout of the target table: the employee attributes plus the
# start_date / end_date / is_active columns used for row versioning.
# Only end_date is nullable.
schema = (
    StructType()
    .add("employee_id", IntegerType(), False)
    .add("name", StringType(), False)
    .add("department", StringType(), False)
    .add("salary", IntegerType(), False)
    .add("start_date", TimestampType(), False)
    .add("end_date", TimestampType(), True)
    .add("is_active", BooleanType(), False)
)


In [0]:
# Incoming batch of employee records (the "source" side of the load).
# Column types are inferred by Spark from the Python values.
source_columns = ["employee_id", "name", "department", "salary"]
source_data = [
    (1, "John Doe", "HR", 50000),
    (2, "Jane Smith", "IT", 60000),
]
source_df = spark.createDataFrame(source_data, source_columns)
source_df.display()

target_table = "incremental_load.default.scd2_employee"

# Load the existing target Delta table, or bootstrap an empty one with
# `schema` when the table cannot be resolved.
try:
    target_df = spark.read.format("delta").table(target_table)
except AnalysisException:
    # Only Spark's analysis error (raised when the table cannot be resolved)
    # is handled here; any other failure propagates instead of being hidden
    # by a bare `except:`.
    # NOTE(review): on runtimes that resolve tables lazily the error may only
    # surface at the first action (the count below) -- confirm for this cluster.
    target_df = spark.createDataFrame([], schema)
    # "errorifexists" rather than "overwrite": if the table does exist and the
    # read failed for some other reason, fail loudly instead of replacing the
    # existing data with an empty table.
    target_df.write.format("delta").mode("errorifexists").saveAsTable(target_table)
print("count of target")
print(target_df.count())

In [0]:
# Re-read the target table from the metastore and keep only the rows that
# are flagged as active.
target_df = spark.read.format("delta").table(target_table)
is_active_row = col("is_active") == lit(True)
active_target_df = target_df.filter(is_active_row)
active_target_df.display()

In [0]:
# Left-join the source batch to the active target rows on employee_id, so
# every source row is kept and unmatched ones carry NULL target columns.
join_condition = source_df.employee_id == active_target_df.employee_id
joined_df = source_df.join(active_target_df, on=join_condition, how="left")
joined_df.display()


In [0]:
# Build the rows to insert as new active versions: source rows that either
# have no active match in the target (target employee_id is NULL after the
# left join) or whose department or salary differs from the active target row.
# NOTE(review): `name` is not part of the change check -- confirm that a
# name-only change is intentionally ignored.
new_or_changed_df = joined_df.filter(
        (active_target_df.employee_id.isNull()) |
        (source_df.department != active_target_df.department) |
        (source_df.salary != active_target_df.salary)
    ).select(
        source_df.employee_id,
        source_df.name,
        source_df.department,
        source_df.salary,
        # current_timestamp() (TimestampType) instead of current_date()
        # (DateType): the target schema declares start_date as a timestamp,
        # matching the timestamp cast used for end_date below.
        current_timestamp().alias("start_date"),
        # Open-ended validity: a new version has no end date yet.
        lit(None).cast("timestamp").alias("end_date"),
        lit(True).alias("is_active")
    )
new_or_changed_df.display()
