# Incremantal Refresh on Hive in SCD type 2 - Union Method

In [0]:
#Import session
from pyspark.sql import SparkSession

In [0]:
#Enable Hive
spark = SparkSession.builder.appName('Hive').enableHiveSupport().getOrCreate()

In [0]:
#Importing necessary datatypes and functions
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import *

In [0]:
#Create Schema
schema = StructType(
    [
        StructField('Id', IntegerType(), False),
        StructField('Name', StringType(), True),
        StructField('Age', IntegerType(), True),
        StructField('DOB', DateType(), True),
        StructField('City', StringType(), True),
        StructField('District', StringType(), True),
        StructField('State', StringType(), True),
        StructField('Flag', StringType(), True),
        StructField('Start_Date', TimestampType(), True),
        StructField('End_Date', TimestampType(), True)
    ]
)

In [0]:
#Create a main table
main_table = spark.createDataFrame([], schema = schema)
main_table.show()

+---+----+---+---+----+--------+-----+----+----------+--------+
| Id|Name|Age|DOB|City|District|State|Flag|Start_Date|End_Date|
+---+----+---+---+----+--------+-----+----+----------+--------+
+---+----+---+---+----+--------+-----+----+----------+--------+



In [0]:
#Create a database
spark.sql('create database if not exists db')

#Insert the dataframe to the table
main_table.write.mode('overwrite').saveAsTable('db.main')

#display the tabel
spark.sql('select * from db.main').show()

+---+----+---+---+----+--------+-----+----+----------+--------+
| Id|Name|Age|DOB|City|District|State|Flag|Start_Date|End_Date|
+---+----+---+---+----+--------+-----+----+----------+--------+
+---+----+---+---+----+--------+-----+----+----------+--------+



In [0]:
#Create new data for 1st type inserting
n_schema = StructType(
    [
        StructField('Id', IntegerType(), False),
        StructField('Name', StringType(), True),
        StructField('Age', IntegerType(), True),
        StructField('DOB', DateType(), True),
        StructField('City', StringType(), True),
        StructField('District', StringType(), True),
        StructField('State', StringType(), True),
    ]
)

data = [
    (1, 'Ajimal', 23, date(2001,7,1), 'Gobi', 'Erode', 'TamilNadu'),
    (2, 'Jervin', 23, date(2001,10,3), 'Mylode', 'Kanyakumari', 'TamilNadu'),
    (3, 'Bergin', 24, date(2000,1,20), 'Pollachi', 'Coimbatore', 'TamilNadu'),
    (4, 'Nishanth', 24, date(2000,11,25), 'Tambaram', 'Chennai', 'TamilNadu'),
    (5, 'Vasanth', 24, date(2001,3,14), 'Poonamalle', 'Chennai', 'TamilNadu')
    ]

#Create dataframe
df = spark.createDataFrame(data = data, schema = n_schema)
df.display()

Id,Name,Age,DOB,City,District,State
1,Ajimal,23,2001-07-01,Gobi,Erode,TamilNadu
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu
3,Bergin,24,2000-01-20,Pollachi,Coimbatore,TamilNadu
4,Nishanth,24,2000-11-25,Tambaram,Chennai,TamilNadu
5,Vasanth,24,2001-03-14,Poonamalle,Chennai,TamilNadu


In [0]:
#Adding columns for history reference
df = df.withColumn('Flag', lit('Y')).\
        withColumn('Start_Date', lit(current_timestamp())).\
        withColumn('End_Date', lit(datetime(9999, 12, 31)))

#display
df.display()

Id,Name,Age,DOB,City,District,State,Flag,Start_Date,End_Date
1,Ajimal,23,2001-07-01,Gobi,Erode,TamilNadu,Y,2024-08-08T08:45:14.742+0000,9999-12-31T00:00:00.000+0000
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,Y,2024-08-08T08:45:14.742+0000,9999-12-31T00:00:00.000+0000
3,Bergin,24,2000-01-20,Pollachi,Coimbatore,TamilNadu,Y,2024-08-08T08:45:14.742+0000,9999-12-31T00:00:00.000+0000
4,Nishanth,24,2000-11-25,Tambaram,Chennai,TamilNadu,Y,2024-08-08T08:45:14.742+0000,9999-12-31T00:00:00.000+0000
5,Vasanth,24,2001-03-14,Poonamalle,Chennai,TamilNadu,Y,2024-08-08T08:45:14.742+0000,9999-12-31T00:00:00.000+0000


In [0]:
#Writing the new datain the database
df.write.mode('overwrite').saveAsTable('db.main')

#Display the Db
spark.sql('select * from db.main').display()

Id,Name,Age,DOB,City,District,State,Flag,Start_Date,End_Date
5,Vasanth,24,2001-03-14,Poonamalle,Chennai,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
3,Bergin,24,2000-01-20,Pollachi,Coimbatore,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
4,Nishanth,24,2000-11-25,Tambaram,Chennai,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
1,Ajimal,23,2001-07-01,Gobi,Erode,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000


In [0]:
#New data to perform SCD type 2
new_data = [
    (1, 'Ajimal', 23, date(2001,7,1), 'Mylode', 'Kanyakumari', 'TamilNadu'),
    (2, 'Jervin', 23, date(2001,10,3), 'Mylode', 'Kanyakumari', 'TamilNadu'),
    (6, 'Aravindh', 22, date(2001,3,31), 'TM Palayam', 'Coimbatore', 'TamilNadu'),
    (7, 'Pavithran', 24, date(2000,11,20), 'Serupanacheri', 'Chennai', 'TamilNadu')
    ]

stream_data = spark.createDataFrame(new_data, n_schema)

#adding the default column to the stream data
stream_data = stream_data.withColumn('Flag', lit('Y')).\
                            withColumn('Start_Date', lit(current_date())).\
                                withColumn('End_Date', lit(datetime(9999, 12, 31)))

stream_data.display()

Id,Name,Age,DOB,City,District,State,Flag,Start_Date,End_Date
1,Ajimal,23,2001-07-01,Mylode,Kanyakumari,TamilNadu,Y,2024-08-08,9999-12-31T00:00:00.000+0000
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,Y,2024-08-08,9999-12-31T00:00:00.000+0000
6,Aravindh,22,2001-03-31,TM Palayam,Coimbatore,TamilNadu,Y,2024-08-08,9999-12-31T00:00:00.000+0000
7,Pavithran,24,2000-11-20,Serupanacheri,Chennai,TamilNadu,Y,2024-08-08,9999-12-31T00:00:00.000+0000


In [0]:
main = spark.sql('select * from db.main')
main.display()

Id,Name,Age,DOB,City,District,State,Flag,Start_Date,End_Date
5,Vasanth,24,2001-03-14,Poonamalle,Chennai,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
3,Bergin,24,2000-01-20,Pollachi,Coimbatore,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
4,Nishanth,24,2000-11-25,Tambaram,Chennai,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
1,Ajimal,23,2001-07-01,Gobi,Erode,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000


In [0]:
#joining the stream data to get the data that are present in both
main_stream_join = main.join(
    stream_data.alias('stream'), 'Id', 'left'
)

main_stream_join.display()

Id,Name,Age,DOB,City,District,State,Flag,Start_Date,End_Date,Name.1,Age.1,DOB.1,City.1,District.1,State.1,Flag.1,Start_Date.1,End_Date.1
5,Vasanth,24,2001-03-14,Poonamalle,Chennai,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000,,,,,,,,,
3,Bergin,24,2000-01-20,Pollachi,Coimbatore,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000,,,,,,,,,
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000,Jervin,23.0,2001-10-03,Mylode,Kanyakumari,TamilNadu,Y,2024-08-08,9999-12-31T00:00:00.000+0000
4,Nishanth,24,2000-11-25,Tambaram,Chennai,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000,,,,,,,,,
1,Ajimal,23,2001-07-01,Gobi,Erode,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000,Ajimal,23.0,2001-07-01,Mylode,Kanyakumari,TamilNadu,Y,2024-08-08,9999-12-31T00:00:00.000+0000


In [0]:
#joining the stream data to get the data that are not present in both
main_stream_join = main.join(
    stream_data.alias('stream'), 'Id', 'inner'
).where(
    (main.Name != stream_data.Name) | 
    (main.Age != stream_data.Age) | 
    (main.DOB != stream_data.DOB) | 
    (main.City != stream_data.City) | 
    (main.District != stream_data.District) | 
    (main.State != stream_data.State)
).select(
    'Id', 'main.Name', 'main.Age', 'main.DOB', 'main.City', 'main.District', 'main.State', 'main.Flag', 'main.Start_Date',
    lit(current_timestamp()).alias('End_Date')
).withColumn('Flag', lit('N'))

main_stream_join.display()

Id,Name,Age,DOB,City,District,State,Flag,Start_Date,End_Date
1,Ajimal,23,2001-07-01,Gobi,Erode,TamilNadu,N,2024-08-08T08:45:16.019+0000,2024-08-08T09:39:31.102+0000


In [0]:
# Identify records that exist in the main table
missed_record = main.join(
    main_stream_join, 'Id', 'left_anti'
)

missed_record.display()

Id,Name,Age,DOB,City,District,State,Flag,Start_Date,End_Date
5,Vasanth,24,2001-03-14,Poonamalle,Chennai,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
3,Bergin,24,2000-01-20,Pollachi,Coimbatore,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
4,Nishanth,24,2000-11-25,Tambaram,Chennai,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000


In [0]:
#Union to get the missed data with the new data
updated_main = missed_record.union(main_stream_join)

updated_main.display()

Id,Name,Age,DOB,City,District,State,Flag,Start_Date,End_Date
5,Vasanth,24,2001-03-14,Poonamalle,Chennai,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
3,Bergin,24,2000-01-20,Pollachi,Coimbatore,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
4,Nishanth,24,2000-11-25,Tambaram,Chennai,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
1,Ajimal,23,2001-07-01,Gobi,Erode,TamilNadu,N,2024-08-08T08:45:16.019+0000,2024-08-08T09:42:15.083+0000


In [0]:
# Identify new records that don't exist in the main table
new_records = stream_data.join(
    main.alias('main'), 
    (stream_data.Id == main.Id) & 
    (stream_data.Name == main.Name) & 
    (stream_data.Age == main.Age) & 
    (stream_data.DOB == main.DOB) & 
    (stream_data.City == main.City) & 
    (stream_data.District == main.District) & 
    (stream_data.State == main.State), 
    'left_anti'
)

new_records.display()

Id,Name,Age,DOB,City,District,State,Flag,Start_Date,End_Date
1,Ajimal,23,2001-07-01,Mylode,Kanyakumari,TamilNadu,Y,2024-08-08,9999-12-31T00:00:00.000+0000
6,Aravindh,22,2001-03-31,TM Palayam,Coimbatore,TamilNadu,Y,2024-08-08,9999-12-31T00:00:00.000+0000
7,Pavithran,24,2000-11-20,Serupanacheri,Chennai,TamilNadu,Y,2024-08-08,9999-12-31T00:00:00.000+0000


In [0]:
#joining all datas|
final_df = updated_main.union(new_records)

final_df.display()

Id,Name,Age,DOB,City,District,State,Flag,Start_Date,End_Date
5,Vasanth,24,2001-03-14,Poonamalle,Chennai,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
3,Bergin,24,2000-01-20,Pollachi,Coimbatore,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
4,Nishanth,24,2000-11-25,Tambaram,Chennai,TamilNadu,Y,2024-08-08T08:45:16.019+0000,9999-12-31T00:00:00.000+0000
1,Ajimal,23,2001-07-01,Gobi,Erode,TamilNadu,N,2024-08-08T08:45:16.019+0000,2024-08-08T09:43:26.923+0000
1,Ajimal,23,2001-07-01,Mylode,Kanyakumari,TamilNadu,Y,2024-08-08T00:00:00.000+0000,9999-12-31T00:00:00.000+0000
6,Aravindh,22,2001-03-31,TM Palayam,Coimbatore,TamilNadu,Y,2024-08-08T00:00:00.000+0000,9999-12-31T00:00:00.000+0000
7,Pavithran,24,2000-11-20,Serupanacheri,Chennai,TamilNadu,Y,2024-08-08T00:00:00.000+0000,9999-12-31T00:00:00.000+0000
