# Incremental Refresh In SCD Type 2 On Delta Table - Merge Method

In [0]:
#Import session
from pyspark.sql import SparkSession

In [0]:
%sql
drop table if exists customer;

create table customer(
  Customer_Id int,
  Name varchar(50),
  Age int,
  DOB date,
  City varchar(50),
  District varchar(50),
  State varchar(50),
  Flag varchar(50),
  Start_Date timestamp,
  End_Date timestamp
)
using delta
location '/delta/customer'

In [0]:
spark.sql('delete from customer')

Out[3]: DataFrame[num_affected_rows: bigint]

In [0]:
spark.sql('select * from customer').show()

+-----------+----+---+---+----+--------+-----+----+----------+--------+
|Customer_Id|Name|Age|DOB|City|District|State|Flag|Start_Date|End_Date|
+-----------+----+---+---+----+--------+-----+----+----------+--------+
+-----------+----+---+---+----+--------+-----+----+----------+--------+



In [0]:
%sql

INSERT INTO customer (Customer_Id, Name, Age, DOB, City, District, State, Flag, Start_Date, End_Date) VALUES
(1, 'Ajimal', 23, '2001-07-01', 'Gobi', 'Erode', 'TamilNadu', 'Y', current_timestamp(), '9999-12-31'),
(2, 'Jervin', 23, '2001-10-03', 'Mylode', 'Kanyakumari', 'TamilNadu', 'Y', current_timestamp(), '9999-12-31'),
(3, 'Bergin', 24, '2000-01-20', 'Pollachi', 'Coimbatore', 'TamilNadu', 'Y', current_timestamp(), '9999-12-31'),
(4, 'Nishanth', 24, '2000-11-25', 'Tambaram', 'Chennai', 'TamilNadu', 'Y', current_timestamp(), '9999-12-31'),
(5, 'Vasanth', 24, '2001-03-14', 'Poonamalle', 'Chennai', 'TamilNadu', 'Y', current_timestamp(), '9999-12-31');


num_affected_rows,num_inserted_rows
5,5


In [0]:
from delta.tables import DeltaTable

# Load the Delta table
target_table = DeltaTable.forPath(spark, '/delta/customer')

# Convert to DataFrame
target_df = target_table.toDF()

# Display the DataFrame in Databricks
target_df.display()


Customer_Id,Name,Age,DOB,City,District,State,Flag,Start_Date,End_Date
1,Ajimal,23,2001-07-01,Gobi,Erode,TamilNadu,Y,2024-08-10T05:04:06.372+0000,9999-12-31T00:00:00.000+0000
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,Y,2024-08-10T05:04:06.372+0000,9999-12-31T00:00:00.000+0000
3,Bergin,24,2000-01-20,Pollachi,Coimbatore,TamilNadu,Y,2024-08-10T05:04:06.372+0000,9999-12-31T00:00:00.000+0000
4,Nishanth,24,2000-11-25,Tambaram,Chennai,TamilNadu,Y,2024-08-10T05:04:06.372+0000,9999-12-31T00:00:00.000+0000
5,Vasanth,24,2001-03-14,Poonamalle,Chennai,TamilNadu,Y,2024-08-10T05:04:06.372+0000,9999-12-31T00:00:00.000+0000


In [0]:
#Import necessary datatype 
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import *

#Create schema for streaming data
schema = StructType(
    [StructField('Customer_Id', IntegerType(), True),
    StructField('Name', StringType(), True),
    StructField('Age', IntegerType(), True),
    StructField('DOB', DateType(), True),
    StructField('City', StringType(), True),
    StructField('District', StringType(), True),
    StructField('State', StringType(), True)]
)

#Data
data = [
    (1, 'Ajimal', 23, date(2001,7,1), 'Mylode', 'Kanyakumari', 'TamilNadu'),
    (2, 'Jervin', 23, date(2001,10,3), 'Mylode', 'Kanyakumari', 'TamilNadu'),
    (6, 'Aravindh', 22, date(2001,3,31), 'TM Palayam', 'Coimbatore', 'TamilNadu'),
    (7, 'Pavithran', 24, date(2000,11,20), 'Serupanacheri', 'Chennai', 'TamilNadu')
    ]

source_df = spark.createDataFrame(data, schema)
source_df.show()

+-----------+---------+---+----------+-------------+-----------+---------+
|Customer_Id|     Name|Age|       DOB|         City|   District|    State|
+-----------+---------+---+----------+-------------+-----------+---------+
|          1|   Ajimal| 23|2001-07-01|       Mylode|Kanyakumari|TamilNadu|
|          2|   Jervin| 23|2001-10-03|       Mylode|Kanyakumari|TamilNadu|
|          6| Aravindh| 22|2001-03-31|   TM Palayam| Coimbatore|TamilNadu|
|          7|Pavithran| 24|2000-11-20|Serupanacheri|    Chennai|TamilNadu|
+-----------+---------+---+----------+-------------+-----------+---------+



In [0]:
join_df = source_df.join(target_df, (source_df.Customer_Id == target_df.Customer_Id)\
    & (target_df.Flag == 'Y'), 'leftouter'
    )
join_df.display()

Customer_Id,Name,Age,DOB,City,District,State,Customer_Id.1,Name.1,Age.1,DOB.1,City.1,District.1,State.1,Flag,Start_Date,End_Date
1,Ajimal,23,2001-07-01,Mylode,Kanyakumari,TamilNadu,1.0,Ajimal,23.0,2001-07-01,Gobi,Erode,TamilNadu,Y,2024-08-10T05:04:06.372+0000,9999-12-31T00:00:00.000+0000
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,2.0,Jervin,23.0,2001-10-03,Mylode,Kanyakumari,TamilNadu,Y,2024-08-10T05:04:06.372+0000,9999-12-31T00:00:00.000+0000
6,Aravindh,22,2001-03-31,TM Palayam,Coimbatore,TamilNadu,,,,,,,,,,
7,Pavithran,24,2000-11-20,Serupanacheri,Chennai,TamilNadu,,,,,,,,,,


In [0]:
join_df = source_df.join(target_df, (source_df.Customer_Id == target_df.Customer_Id)\
    & (target_df.Flag == 'Y'), 'leftouter'
    ).select(source_df['*'], 
             target_df.Customer_Id.alias('t_Customer_Id'),
             target_df.Name.alias('t_Name'),
             target_df.Age.alias('t_Age'),
             target_df.DOB.alias('t_DOB'),
             target_df.City.alias('t_City'),
             target_df.District.alias('t_District'),
             target_df.State.alias('t_State')
             )
    
join_df.display()

Customer_Id,Name,Age,DOB,City,District,State,t_Customer_Id,t_Name,t_Age,t_DOB,t_City,t_District,t_State
1,Ajimal,23,2001-07-01,Mylode,Kanyakumari,TamilNadu,1.0,Ajimal,23.0,2001-07-01,Gobi,Erode,TamilNadu
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,2.0,Jervin,23.0,2001-10-03,Mylode,Kanyakumari,TamilNadu
6,Aravindh,22,2001-03-31,TM Palayam,Coimbatore,TamilNadu,,,,,,,
7,Pavithran,24,2000-11-20,Serupanacheri,Chennai,TamilNadu,,,,,,,


In [0]:
filter_df = join_df.filter(xxhash64(join_df.Name, join_df.Age, join_df.DOB, join_df.City, join_df.District, join_df.State)
                           != xxhash64(join_df.t_Name, join_df.t_Age, join_df.t_DOB, join_df.t_City, join_df.t_District, join_df.t_State))

filter_df.display()

Customer_Id,Name,Age,DOB,City,District,State,t_Customer_Id,t_Name,t_Age,t_DOB,t_City,t_District,t_State
1,Ajimal,23,2001-07-01,Mylode,Kanyakumari,TamilNadu,1.0,Ajimal,23.0,2001-07-01,Gobi,Erode,TamilNadu
6,Aravindh,22,2001-03-31,TM Palayam,Coimbatore,TamilNadu,,,,,,,
7,Pavithran,24,2000-11-20,Serupanacheri,Chennai,TamilNadu,,,,,,,


In [0]:
merge_df = filter_df.withColumn('Merge_Key', filter_df['Customer_Id'])
merge_df.display()

Customer_Id,Name,Age,DOB,City,District,State,t_Customer_Id,t_Name,t_Age,t_DOB,t_City,t_District,t_State,Merge_Key
1,Ajimal,23,2001-07-01,Mylode,Kanyakumari,TamilNadu,1.0,Ajimal,23.0,2001-07-01,Gobi,Erode,TamilNadu,1
6,Aravindh,22,2001-03-31,TM Palayam,Coimbatore,TamilNadu,,,,,,,,6
7,Pavithran,24,2000-11-20,Serupanacheri,Chennai,TamilNadu,,,,,,,,7


In [0]:
dummy_df = filter_df.filter('t_Customer_Id is not null').withColumn('Merge_Key', lit('None'))
dummy_df.display()

Customer_Id,Name,Age,DOB,City,District,State,t_Customer_Id,t_Name,t_Age,t_DOB,t_City,t_District,t_State,Dummy_Key
1,Ajimal,23,2001-07-01,Mylode,Kanyakumari,TamilNadu,1,Ajimal,23,2001-07-01,Gobi,Erode,TamilNadu,


In [0]:
scd_df = merge_df.union(dummy_df)
scd_df.display()

Customer_Id,Name,Age,DOB,City,District,State,t_Customer_Id,t_Name,t_Age,t_DOB,t_City,t_District,t_State,Merge_Key
1,Ajimal,23,2001-07-01,Mylode,Kanyakumari,TamilNadu,1.0,Ajimal,23.0,2001-07-01,Gobi,Erode,TamilNadu,1.0
6,Aravindh,22,2001-03-31,TM Palayam,Coimbatore,TamilNadu,,,,,,,,6.0
7,Pavithran,24,2000-11-20,Serupanacheri,Chennai,TamilNadu,,,,,,,,7.0
1,Ajimal,23,2001-07-01,Mylode,Kanyakumari,TamilNadu,1.0,Ajimal,23.0,2001-07-01,Gobi,Erode,TamilNadu,


In [0]:
target_table.alias('target').merge(
    source = scd_df.alias('source'),
    condition = "target.Customer_Id = source.Merge_Key and target.Flag = 'Y'"
).whenMatchedUpdate(
    set = {
        'Flag' : lit('N'),
        'End_Date' : 'current_date'
    }
).whenNotMatchedInsert(
    values = {
        'Customer_Id' : 'source.Customer_Id',
        'Name' : 'source.Name',
        'Age' : 'source.Age',
        'DOB' : 'source.DOB',
        'City' : 'source.City',
        'District' : 'source.District',
        'State' : 'source.State',
        'Flag' : lit('Y'),
        'Start_Date' : 'current_date',
        'End_Date' : lit('9999-12-31')
    }
).execute()

In [0]:
spark.sql('select * from customer order by Customer_Id').display()

Customer_Id,Name,Age,DOB,City,District,State,Flag,Start_Date,End_Date
1,Ajimal,23,2001-07-01,Gobi,Erode,TamilNadu,N,2024-08-10T05:04:06.372+0000,2024-08-10T00:00:00.000+0000
1,Ajimal,23,2001-07-01,Mylode,Kanyakumari,TamilNadu,Y,2024-08-10T00:00:00.000+0000,9999-12-31T00:00:00.000+0000
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,Y,2024-08-10T05:04:06.372+0000,9999-12-31T00:00:00.000+0000
3,Bergin,24,2000-01-20,Pollachi,Coimbatore,TamilNadu,Y,2024-08-10T05:04:06.372+0000,9999-12-31T00:00:00.000+0000
4,Nishanth,24,2000-11-25,Tambaram,Chennai,TamilNadu,Y,2024-08-10T05:04:06.372+0000,9999-12-31T00:00:00.000+0000
5,Vasanth,24,2001-03-14,Poonamalle,Chennai,TamilNadu,Y,2024-08-10T05:04:06.372+0000,9999-12-31T00:00:00.000+0000
6,Aravindh,22,2001-03-31,TM Palayam,Coimbatore,TamilNadu,Y,2024-08-10T00:00:00.000+0000,9999-12-31T00:00:00.000+0000
7,Pavithran,24,2000-11-20,Serupanacheri,Chennai,TamilNadu,Y,2024-08-10T00:00:00.000+0000,9999-12-31T00:00:00.000+0000
