In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, cast
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from delta.tables import DeltaTable

# Obtain the Spark session for this notebook (getOrCreate reuses an active one)
spark = (
    SparkSession.builder
    .appName("SCD_Type1_Delta")
    .getOrCreate()
)

# Declared column types for the customer table, spelled out field by field so
# they do not depend on type inference. Arguments to add(): name, type, nullable.
schema = (
    StructType()
    .add("customer_id", IntegerType(), False)
    .add("name", StringType(), True)
    .add("email", StringType(), True)
    .add("phone", StringType(), True)
    .add("last_updated", TimestampType(), True)
)

# Initial records: (customer_id, name, email, phone)
initial_data = [
    (1, "John Smith", "john.smith@example.com", "123-456-7890"),
    (2, "Jane Doe", "jane.doe@example.com", "987-654-3210"),
    (3, "Bob Wilson", "bob.w@example.com", "555-555-5555"),
]

# Create the initial DataFrame and stamp every row with the load time.
# DataFrame has no .cast() method, so the declared types are applied by casting
# each column to the data type of its matching field in `schema`; the select
# also fixes the column order to the schema's order.
# NOTE: a cast changes the data type only; the nullable flags declared in
# `schema` are not enforced by it.
initial_df = (
    spark.createDataFrame(initial_data, ["customer_id", "name", "email", "phone"])
    .withColumn("last_updated", current_timestamp())
    .select(*[col(field.name).cast(field.dataType) for field in schema.fields])
)
print("initial_df")
display(initial_df)
# Target Delta table. Despite the variable name this is a dotted table name
# (it is passed to saveAsTable / DeltaTable.forName), not a filesystem path.
delta_table_path = "incremental_load.default.customer_scd1"

# Create or overwrite the Delta table with the initial data
try:
    initial_df.write.format("delta").mode("overwrite").saveAsTable(delta_table_path)
except Exception as e:
    print(f"Error creating Delta table: {e}")
    raise  # bare raise keeps the original traceback intact

# Load the table as a DeltaTable handle (used below for merge and maintenance)
try:
    delta_table = DeltaTable.forName(spark, delta_table_path)
except Exception as e:
    print(f"Error loading Delta table: {e}")
    raise

# Display initial records. The read on its own only builds a DataFrame;
# it has to be displayed explicitly for anything to appear under the heading.
print("Initial Records:")
spark.read.format("delta").table(delta_table_path).display()

# Update records (new/updated data): (customer_id, name, email, phone)
update_data = [
    (1, "John Doe", "john.doe@example.com", "123-456-7890"),  # Updated name, email
    (2, "Jane Smith", "jane.smith@example.com", "987-654-3210"),  # Updated name, email
    (4, "Alice Brown", "alice.b@example.com", "111-222-3333"),  # New record
]

# Create the update DataFrame with the same column types as `schema`.
# Without the casts, customer_id would be inferred as bigint from the Python
# ints and would not match the integer key declared for the target table.
update_df = (
    spark.createDataFrame(update_data, ["customer_id", "name", "email", "phone"])
    .withColumn("last_updated", current_timestamp())
    .select(*[col(field.name).cast(field.dataType) for field in schema.fields])
)
print("update_df")
display(update_df)

# Perform SCD Type 1 merge: rows whose customer_id already exists in the target
# are overwritten in place, rows with an unseen customer_id are inserted.
merge_columns = ["name", "email", "phone", "last_updated"]

# target column -> source expression, for the non-key columns
matched_assignments = {column: f"source.{column}" for column in merge_columns}
# inserts additionally carry the key itself
insert_assignments = {"customer_id": "source.customer_id", **matched_assignments}

try:
    (
        delta_table.alias("target")
        .merge(update_df.alias("source"), "target.customer_id = source.customer_id")
        .whenMatchedUpdate(set=matched_assignments)
        .whenNotMatchedInsert(values=insert_assignments)
        .execute()
    )
except Exception as e:
    print(f"Error during merge operation: {e}")
    raise e

# Table maintenance. Best effort: a failure is reported and the notebook goes on.
# Each step has its own handler so that a failure in one does not skip the
# other, and compaction runs before vacuum so the vacuum sees the table state
# left behind by the compaction.
try:
    delta_table.optimize().executeCompaction()
except Exception as e:
    print(f"Error during table optimization: {e}")

try:
    delta_table.vacuum()
except Exception as e:
    print(f"Error during table optimization: {e}")

# Display final table
print("Final Records after Update:")
final_df = spark.read.format("delta").table(delta_table_path)
final_df.display()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = (
    SparkSession.builder
    .appName("SCD Type 1")
    .getOrCreate()
)

# Simulate source and target; both frames share one column layout
customer_columns = ["customer_id", "name", "email", "address"]

# Incoming rows
source_rows = [
    (1, "John Doe", "john.doe@new.com", "123 Elm St"),
    (3, "Bob Smith", "bob@example.com", "900 Pine Street"),
]
df_source = spark.createDataFrame(source_rows, customer_columns)
print("source")
df_source.display()

# Current content of the dimension
target_rows = [
    (1, "John Doe", "john@example.com", "123 Elm St"),
    (2, "Alice Lee", "alice@old.com", "500 Maple Ave"),
]
df_target = spark.createDataFrame(target_rows, customer_columns)
print("target")
df_target.display()
# Step 1: Changed records (compare tracked columns).
# The comparison is null-safe: a plain != evaluates to NULL when either side is
# NULL, and filter() drops such rows, so a value changing to or from NULL would
# go undetected. eqNullSafe treats NULL as an ordinary comparable value.
df_changed = (
    df_source.alias("src")
    .join(df_target.alias("tgt"), "customer_id")
    .filter(
        ~col("src.name").eqNullSafe(col("tgt.name"))
        | ~col("src.email").eqNullSafe(col("tgt.email"))
        | ~col("src.address").eqNullSafe(col("tgt.address"))
    )
    .select("src.*")  # keep the source side only: these are the new values
)
print("changed")
df_changed.display()
# Step 2: New records (source rows whose customer_id is absent from the target)
df_new = df_source.join(df_target, "customer_id", "left_anti")
print("new")
df_new.display()
# Step 3: Union of new and changed records.
# unionByName matches columns by name; a plain union() matches by position and
# would silently mix columns if the two frames ever differed in column order.
df_upserts = df_changed.unionByName(df_new)
print("df_upserts")
df_upserts.display()
# Step 4: Remove old versions (target rows that are about to be replaced)
df_remaining = df_target.join(df_upserts, "customer_id", "left_anti")
print("remaining")
df_remaining.display()
# Step 5: Final dimension table = untouched target rows + upserted rows
df_final = df_remaining.unionByName(df_upserts)
print("final_df")
# Show result
df_final.display()
