In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, current_date, to_date, when
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, BooleanType
from datetime import date

# Spark entry point shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("SCD_Type2_LeftJoin_Example")
    .getOrCreate()
)

# Target dimension layout: business attributes first, then the SCD Type 2
# bookkeeping columns (validity window and current-row flag).
customer_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("customer_id", IntegerType(), False),
        ("name", StringType(), True),
        ("address", StringType(), True),
        ("email", StringType(), True),
        ("effective_date", DateType(), True),
        ("end_date", DateType(), True),
        ("is_current", BooleanType(), True),
    )
])

# Seed rows: two customers, each with one open-ended (end_date is None),
# current version that became effective on the same day.
initial_effective_date = date(2023, 1, 1)
existing_data = [
    (1, "John Doe", "123 Old St", "john@example.com", initial_effective_date, None, True),
    (2, "Jane Smith", "456 Old Ave", "jane@example.com", initial_effective_date, None, True),
]

# Materialise the existing dimension and show it under a text label.
customer_df = spark.createDataFrame(existing_data, customer_schema)
print("customer_df")
customer_df.display()
# Layout of the incoming change feed: the dimension's business attributes plus
# the date attached to each incoming row.
update_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("customer_id", IntegerType(), False),
        ("name", StringType(), True),
        ("address", StringType(), True),
        ("email", StringType(), True),
        ("update_date", DateType(), True),
    )
])

# One batch of source rows, all stamped with the same update date.
batch_date = date(2023, 6, 1)
update_data = [
    (1, "John Doe", "789 New St", "john.doe@example.com", batch_date),  # address and email differ from the dimension
    (2, "Jane Smith", "456 Old Ave", "jane@example.com", batch_date),   # identical to the dimension row
    (3, "Bob Wilson", "101 New Rd", "bob@example.com", batch_date),     # customer_id not present in the dimension
]

# Materialise the batch and show it under a text label.
updates_df = spark.createDataFrame(update_data, update_schema)
print("updates_df")
updates_df.display()



In [0]:
# Pair every incoming row with that customer's current dimension row, if any.
# Incoming rows without a counterpart keep NULLs in the dimension-side columns.
matches_current_row = (
    (updates_df.customer_id == customer_df.customer_id)
    & (customer_df.is_current == True)
)
joined_df = updates_df.join(customer_df, matches_current_row, "left_outer")
print("joined_df")
joined_df.display()

# An incoming row needs a new version when the customer is unknown to the
# dimension, or when its address or email differs from the current version.
# NOTE: `!=` evaluates to NULL when either side is NULL and filter() drops
# NULL results, so a comparison involving a NULL address/email never flags a
# row on its own.
is_new_customer = customer_df.customer_id.isNull()
address_changed = updates_df.address != customer_df.address
email_changed = updates_df.email != customer_df.email

# Shape the selected rows like the dimension: effective from the update date,
# open-ended, and flagged as current.
filtered_df = (
    joined_df
    .filter(is_new_customer | address_changed | email_changed)
    .select(
        updates_df.customer_id,
        updates_df.name,
        updates_df.address,
        updates_df.email,
        updates_df.update_date.alias("effective_date"),
        lit(None).cast(DateType()).alias("end_date"),
        lit(True).alias("is_current"),
    )
)

filtered_df.display()

In [0]:
# Current dimension rows superseded by this batch: the customer already exists
# and its address or email differs from the incoming row.
# NOTE: `!=` yields NULL when either side is NULL, so such comparisons do not
# mark a row as superseded.
has_current_row = customer_df.customer_id.isNotNull()
attributes_differ = (
    (updates_df.address != customer_df.address)
    | (updates_df.email != customer_df.email)
)

# Keep the old attribute values and start date, close the validity window at
# the incoming update date, and clear the current flag.
expired_data = (
    joined_df
    .filter(has_current_row & attributes_differ)
    .select(
        customer_df.customer_id,
        customer_df.name,
        customer_df.address,
        customer_df.email,
        customer_df.effective_date,
        updates_df.update_date.alias("end_date"),
        lit(False).alias("is_current"),
    )
)
expired_data.display()

In [0]:
# Dimension rows that are carried over exactly as they are.
#
# The anti join removes only the *current* version of customers that receive a
# new version in filtered_df (that version is re-emitted, closed, by the expiry
# step). Requiring `is_current` in the condition means already-closed history
# rows never match and are therefore kept; matching on customer_id alone would
# silently drop a changed customer's entire history.
# The condition references filtered_df (the frame actually being joined) rather
# than updates_df.
superseded_current_row = (
    (customer_df.customer_id == filtered_df.customer_id)
    & customer_df.is_current
)
unchanged_records = customer_df.join(
    filtered_df,
    superseded_current_row,
    "left_anti"
).select(
    # Explicit dimension column order for the union that builds the final table.
    customer_df.customer_id,
    customer_df.name,
    customer_df.address,
    customer_df.email,
    customer_df.effective_date,
    customer_df.end_date,
    customer_df.is_current
)
unchanged_records.display()


In [0]:
# Assemble the refreshed dimension from its three parts: new versions
# (filtered_df), closed previous versions (expired_data) and untouched rows
# (unchanged_records).
# unionByName aligns columns by name instead of by position, so the result does
# not depend on the three selects listing their columns in the same order; it
# also matches how the other examples in this notebook build their final table.
final_df = (
    filtered_df
    .unionByName(expired_data)
    .unionByName(unchanged_records)
)
final_df.display()

# Same rows again, ordered so each customer's versions read chronologically.
final_df.orderBy("customer_id", "effective_date").display()

### Source DataFrame (latest snapshot from source system)

In [0]:
# Latest snapshot delivered by the source system, one row per CustomerID.
source_columns = ["CustomerID", "Name", "City"]
new_source_data = [
    (101, "Alice Smith", "Chicago"),  # city differs from the initial load (Boston)
    (102, "Bob Brown", "Seattle"),    # same values as the initial load
]

df_new_source = spark.createDataFrame(new_source_data, schema=source_columns)
df_new_source.display()

### Target Dimension DataFrame (Customer_Dim)

In [0]:
from pyspark.sql.functions import current_date, lit

# Simulated first extract from the source system.
initial_source_data = [
    (101, "Alice Smith", "Boston"),
    (102, "Bob Brown", "Seattle"),
]
df_initial_source = spark.createDataFrame(
    initial_source_data,
    ["CustomerID", "Name", "City"],
)

# Turn the extract into the dimension by appending the SCD Type 2 bookkeeping
# columns in their final order: each row starts today, is open-ended
# (EndDate 9999-12-31) and is flagged as the current version.
df_initial_target = df_initial_source.select(
    "CustomerID",
    "Name",
    "City",
    current_date().alias("StartDate"),
    lit("9999-12-31").cast("date").alias("EndDate"),
    lit(True).alias("IsCurrent"),
)

df_initial_target.display()


In [0]:
# date_add is used below but is otherwise only imported by a later cell, so a
# fresh-kernel "Run All" would fail here with NameError. Importing it in this
# cell keeps the cell runnable in notebook order.
from pyspark.sql.functions import date_add

# Pair every snapshot row with that customer's current dimension row, if any.
# Snapshot rows without a counterpart keep NULLs in the dimension-side columns.
joined_df = df_new_source.join(
    df_initial_target,
    (df_new_source.CustomerID == df_initial_target.CustomerID) & (df_initial_target.IsCurrent == True),
    "left_outer"
)
joined_df.display()

# Rows that need a new version: the customer is missing from the dimension, or
# its City or Name differs from the current version.
changed_df = joined_df.filter(
    (df_initial_target.CustomerID.isNull())
    | (df_new_source.City != df_initial_target.City)
    | (df_new_source.Name != df_initial_target.Name)
).select(
    df_new_source.CustomerID,
    df_new_source.Name,
    df_new_source.City,
    # The new version starts the day after today; the expiry step closes the
    # previous version with today's date.
    date_add(current_date(),1).alias("StartDate"),
    lit("9999-12-31").cast("date").alias("EndDate"),
    lit(True).alias("IsCurrent")
)
changed_df.display()


In [0]:
from pyspark.sql.functions import current_date, lit, date_sub, date_add



In [0]:
# Current dimension rows superseded by the snapshot: the customer exists in the
# dimension AND its City or Name differs.
# `&` binds tighter than `|` in Python, so the two "differs" checks must be
# grouped explicitly; without the parentheses the filter reads
# (exists AND city differs) OR name differs, which lets the Name check bypass
# the existence guard.
has_current_row = df_initial_target.CustomerID.isNotNull()
attributes_differ = (
    (df_new_source.City != df_initial_target.City)
    | (df_new_source.Name != df_initial_target.Name)
)
expired_df = joined_df.filter(has_current_row & attributes_differ)
expired_df.display()

# Keep the old attribute values and StartDate, close the validity window with
# today's date, and clear the current flag.
expired_df = expired_df.select(
    df_initial_target.CustomerID,
    df_initial_target.Name,
    df_initial_target.City,
    df_initial_target.StartDate,
    current_date().alias("EndDate"),
    lit(False).alias("IsCurrent")
)
expired_df.display()

In [0]:
# Dimension rows that are carried over exactly as they are.
# Only the *current* version of a customer present in changed_df is removed
# here (it is re-emitted, closed, in expired_df). Requiring IsCurrent in the
# anti-join condition means already-closed history rows never match and are
# kept; matching on CustomerID alone would drop a changed customer's whole
# history.
unchanged_records = df_initial_target.join(
    changed_df,
    (df_initial_target.CustomerID == changed_df.CustomerID) & df_initial_target.IsCurrent,
    "left_anti",
)
unchanged_records.display()

# Refreshed dimension = untouched rows + new versions + closed old versions,
# aligned by column name.
final_df = unchanged_records.unionByName(changed_df).unionByName(expired_df)
final_df.display()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, BooleanType
from datetime import date

spark = SparkSession.builder.getOrCreate()

# Product dimension layout; every column is declared non-nullable.
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("SurrogateKey", IntegerType()),
        ("ProductID", IntegerType()),
        ("ProductName", StringType()),
        ("Category", StringType()),
        ("Price", DoubleType()),
        ("StartDate", DateType()),
        ("EndDate", DateType()),
        ("IsCurrent", BooleanType()),
    )
])

# Three products with a historical StartDate, each stored as an open-ended
# (EndDate 9999-12-31) current version.
loaded_on = date(2024, 1, 1)
open_ended = date(9999, 12, 31)
initial_data = [
    (1, 1001, "iPhone 14", "Mobile", 999.0, loaded_on, open_ended, True),
    (2, 1002, "Galaxy S23", "Mobile", 849.0, loaded_on, open_ended, True),
    (3, 1003, "Dell XPS 13", "Laptop", 1199.0, loaded_on, open_ended, True),
]

df_dim_initial = spark.createDataFrame(initial_data, schema)
df_dim_initial.display()


In [0]:
# Today's product snapshot from the source system.
product_columns = ["ProductID", "ProductName", "Category", "Price"]
new_products = [
    (1001, "iPhone 14", "Mobile", 999.0),     # same price as the dimension row
    (1002, "Galaxy S23", "Mobile", 899.0),    # price differs from the dimension row (849.0)
    (1003, "Dell XPS 13", "Laptop", 1199.0),  # same price as the dimension row
]

df_source = spark.createDataFrame(new_products, schema=product_columns)
df_source.display()


In [0]:
# Match each snapshot row to the product's current dimension row, if any.
# Snapshot rows without a counterpart keep NULLs in the dimension-side columns.
join_condition = (df_source.ProductID == df_dim_initial.ProductID) & (
    df_dim_initial.IsCurrent == True
)
joined_df = df_source.join(df_dim_initial, on=join_condition, how="left_outer")
joined_df.display()

# Products that need a new version: missing from the dimension, or priced
# differently from the current version. Only Price is compared.
is_new_product = df_dim_initial.ProductID.isNull()
price_changed = df_source.Price != df_dim_initial.Price
changed_df = joined_df.filter(is_new_product | price_changed)
changed_df.display()

# Shape those rows like the dimension (without a surrogate key): the new
# version starts the day after today, is open-ended, and is flagged current.
changed_df = changed_df.select(
    df_source.ProductID.alias("ProductID"),
    df_source.ProductName.alias("ProductName"),
    df_source.Category.alias("Category"),
    df_source.Price.alias("Price"),
    date_add(current_date(), 1).alias("StartDate"),
    lit("9999-12-31").cast("date").alias("EndDate"),
    lit(True).alias("IsCurrent"),
)
changed_df.display()

In [0]:
# Current dimension rows whose price no longer matches the snapshot.
has_current_row = df_dim_initial.ProductID.isNotNull()
price_mismatch = df_source.Price != df_dim_initial.Price
expired_df = joined_df.filter(has_current_row & price_mismatch)
expired_df.display()

# Keep the dimension-side values and StartDate, close the validity window with
# today's date, and clear the current flag.
expired_df = expired_df.select(
    df_dim_initial.ProductID.alias("ProductID"),
    df_dim_initial.ProductName.alias("ProductName"),
    df_dim_initial.Category.alias("Category"),
    df_dim_initial.Price.alias("Price"),
    df_dim_initial.StartDate.alias("StartDate"),
    current_date().alias("EndDate"),
    lit(False).alias("IsCurrent"),
)
expired_df.display()

In [0]:
# Dimension rows that are carried over exactly as they are.
# Only the *current* version of a product present in changed_df is removed
# here (it is re-emitted, closed, in expired_df). Requiring IsCurrent in the
# anti-join condition means already-closed history rows never match and are
# kept; matching on ProductID alone would drop a changed product's whole
# history.
# SurrogateKey is dropped so the columns line up with changed_df and
# expired_df, which do not carry one.
unchanged_records = df_dim_initial.join(
    changed_df,
    (df_dim_initial.ProductID == changed_df.ProductID) & df_dim_initial.IsCurrent,
    "left_anti",
).drop("SurrogateKey")
unchanged_records.display()

# Refreshed dimension = untouched rows + new versions + closed old versions,
# aligned by column name.
final_df = unchanged_records.unionByName(changed_df).unionByName(expired_df)
final_df.display()