In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    date_sub,
    lag,
    lead,
    lit,
    row_number,
    when,
)
from pyspark.sql.window import Window

# Build a Slowly Changing Dimension (Type 2) view from two transaction
# sources: every (ID, Product) key gets one row per transaction, with a
# validity interval [start_date, end_date] and an is_current flag.

# Initialize Spark session
spark = SparkSession.builder.appName("SCDType2Example").getOrCreate()

# Column layout shared by both source DataFrames
columns = ['ID', 'Product', 'Transaction_date', 'Price']

# Sample data for source-1-df
data1 = [
    (1, 'A', '2024-01-14', 60),
    (2, 'B', '2024-04-17', 34),
    (3, 'C', '2024-04-09', 35),
    (4, 'D', '2024-04-15', 67),
    (10, 'Z', '2023-12-25', 109),
]

# Sample data for source-2-df
data2 = [
    (11, 'U', '2024-02-28', 101),
    (1, 'A', '2024-02-28', 2000),
    (1, 'A', '2024-02-18', 12),
]

# Create DataFrames
df1 = spark.createDataFrame(data1, columns)
df2 = spark.createDataFrame(data2, columns)

# Convert Transaction_date from string to DateType so that window ordering
# and date arithmetic operate on real dates rather than on text.
df1 = df1.withColumn("Transaction_date", col("Transaction_date").cast("date"))
df2 = df2.withColumn("Transaction_date", col("Transaction_date").cast("date"))

# Combine DataFrames. unionByName matches columns by name, so a future
# difference in column order between the sources cannot silently misalign
# the data the way the positional union() would.
df_combined = df1.unionByName(df2)

# One window per business key, ordered chronologically.
# NOTE: two rows with the same key AND the same Transaction_date have no
# defined relative order here; add a tie-breaker column if that can occur.
window_spec = Window.partitionBy("ID", "Product").orderBy(col("Transaction_date"))

# Sequence number of each version within its key (1 = oldest version).
df_combined = df_combined.withColumn("row_num", row_number().over(window_spec))

# Neighbouring transaction dates within the same key: prev_date is null for
# the first version, next_date is null for the most recent version.
df_combined = df_combined.withColumn("prev_date", lag("Transaction_date").over(window_spec))
df_combined = df_combined.withColumn("next_date", lead("Transaction_date").over(window_spec))

# A record is current when no later transaction exists for its key.
df_combined = df_combined.withColumn(
    "is_current",
    when(col("next_date").isNull(), lit(1)).otherwise(lit(0)),
)

# A version becomes valid on its own transaction date and expires the day
# before the next version starts. date_sub returns null when next_date is
# null, so current records automatically get an open-ended (null) end_date.
df_combined = df_combined.withColumn("start_date", col("Transaction_date"))
df_combined = df_combined.withColumn("end_date", date_sub(col("next_date"), 1))

# Select the required columns
result_df = df_combined.select("ID", "Product", "Price", "start_date", "end_date", "is_current")

# Show the result. Sorting is applied only for display: show() on its own
# gives no ordering guarantee across partitions.
result_df.orderBy("ID", "start_date").show()


+---+-------+-----+----------+----------+----------+
| ID|Product|Price|start_date|  end_date|is_current|
+---+-------+-----+----------+----------+----------+
|  1|      A|   60|2024-01-14|2024-02-17|         0|
|  1|      A|   12|2024-02-18|2024-02-27|         0|
|  1|      A| 2000|2024-02-28|      null|         1|
|  2|      B|   34|2024-04-17|      null|         1|
|  3|      C|   35|2024-04-09|      null|         1|
|  4|      D|   67|2024-04-15|      null|         1|
| 10|      Z|  109|2023-12-25|      null|         1|
| 11|      U|  101|2024-02-28|      null|         1|
+---+-------+-----+----------+----------+----------+

