In [0]:
#Import Session
from pyspark.sql import SparkSession

In [0]:
spark.sql('drop table customer2')

Out[61]: DataFrame[]

In [0]:
%sql

drop table if exists customer2;

create table customer2(
  Id int,
  Name varchar(50),
  Age int,
  DOB date,
  City varchar(50),
  District varchar(50),
  State varchar(50),
  Flag varchar(50),
  Start_Date timestamp,
  End_Date timestamp
) using delta
location '/delta/customer2'

In [0]:
spark.sql('delete from customer2')

Out[63]: DataFrame[num_affected_rows: bigint]

In [0]:
%sql

INSERT INTO customer2 (Id, Name, Age, DOB, City, District, State, Flag, Start_Date, End_Date) VALUES
(1, 'Ajimal', 23, '2001-07-01', 'Gobi', 'Erode', 'TamilNadu', 'Y', current_timestamp(), '9999-12-31'),
(2, 'Jervin', 23, '2001-10-03', 'Mylode', 'Kanyakumari', 'TamilNadu', 'Y', current_timestamp(), '9999-12-31'),
(3, 'Bergin', 24, '2000-01-20', 'Pollachi', 'Coimbatore', 'TamilNadu', 'Y', current_timestamp(), '9999-12-31'),
(4, 'Nishanth', 24, '2000-11-25', 'Tambaram', 'Chennai', 'TamilNadu', 'Y', current_timestamp(), '9999-12-31'),
(5, 'Vasanth', 24, '2001-03-14', 'Poonamalle', 'Chennai', 'TamilNadu', 'Y', current_timestamp(), '9999-12-31');

num_affected_rows,num_inserted_rows
5,5


In [0]:
spark.sql('select * from customer2').show()

+---+--------+---+----------+----------+-----------+---------+----+--------------------+-------------------+
| Id|    Name|Age|       DOB|      City|   District|    State|Flag|          Start_Date|           End_Date|
+---+--------+---+----------+----------+-----------+---------+----+--------------------+-------------------+
|  1|  Ajimal| 23|2001-07-01|      Gobi|      Erode|TamilNadu|   Y|2024-08-12 04:40:...|9999-12-31 00:00:00|
|  2|  Jervin| 23|2001-10-03|    Mylode|Kanyakumari|TamilNadu|   Y|2024-08-12 04:40:...|9999-12-31 00:00:00|
|  3|  Bergin| 24|2000-01-20|  Pollachi| Coimbatore|TamilNadu|   Y|2024-08-12 04:40:...|9999-12-31 00:00:00|
|  4|Nishanth| 24|2000-11-25|  Tambaram|    Chennai|TamilNadu|   Y|2024-08-12 04:40:...|9999-12-31 00:00:00|
|  5| Vasanth| 24|2001-03-14|Poonamalle|    Chennai|TamilNadu|   Y|2024-08-12 04:40:...|9999-12-31 00:00:00|
+---+--------+---+----------+----------+-----------+---------+----+--------------------+-------------------+



In [0]:
from delta.tables import DeltaTable

# Load the Delta table
target_table = DeltaTable.forPath(spark, '/delta/customer2')

# Convert to DataFrame
target_df = target_table.toDF()

# Display the DataFrame in Databricks
target_df.display()

Id,Name,Age,DOB,City,District,State,Flag,Start_Date,End_Date
1,Ajimal,23,2001-07-01,Gobi,Erode,TamilNadu,Y,2024-08-12T04:40:07.840+0000,9999-12-31T00:00:00.000+0000
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,Y,2024-08-12T04:40:07.840+0000,9999-12-31T00:00:00.000+0000
3,Bergin,24,2000-01-20,Pollachi,Coimbatore,TamilNadu,Y,2024-08-12T04:40:07.840+0000,9999-12-31T00:00:00.000+0000
4,Nishanth,24,2000-11-25,Tambaram,Chennai,TamilNadu,Y,2024-08-12T04:40:07.840+0000,9999-12-31T00:00:00.000+0000
5,Vasanth,24,2001-03-14,Poonamalle,Chennai,TamilNadu,Y,2024-08-12T04:40:07.840+0000,9999-12-31T00:00:00.000+0000


In [0]:
# Import necessary datatype 
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import *

# Create schema for streaming data
schema = StructType(
    [StructField('Id', IntegerType(), True),
    StructField('Name', StringType(), True),
    StructField('Age', IntegerType(), True),
    StructField('DOB', DateType(), True),
    StructField('City', StringType(), True),
    StructField('District', StringType(), True),
    StructField('State', StringType(), True),
    StructField('Last_Modified', DateType(), True)]
)

# Data
data1 = [
    (1, 'Ajimal', 23, date(2001,7,1), 'Mylode', 'Kanyakumari', 'TamilNadu', date(2024,8,11)),
    (2, 'Jervin', 23, date(2001,10,3), 'Mylode', 'Kanyakumari', 'TamilNadu', date(2024,8,11)),
    (6, 'Aravindh', 22, date(2001,3,31), 'TM Palayam', 'Coimbatore', 'TamilNadu', date(2024,8,11)),
    (7, 'Pavithran', 24, date(2000,11,20), 'Serupanacheri', 'Chennai', 'TamilNadu', date(2024,8,11))
    ]

incoming_1 = spark.createDataFrame(data1, schema)
incoming_1.display()

Id,Name,Age,DOB,City,District,State,Last_Modified
1,Ajimal,23,2001-07-01,Mylode,Kanyakumari,TamilNadu,2024-08-11
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,2024-08-11
6,Aravindh,22,2001-03-31,TM Palayam,Coimbatore,TamilNadu,2024-08-11
7,Pavithran,24,2000-11-20,Serupanacheri,Chennai,TamilNadu,2024-08-11


In [0]:
# Data
data2 = [
    (1, 'Ajimal', 22, date(2001,7,1), 'Mylode', 'Kanyakumari', 'TamilNadu', date(2024,8,12)),
    (2, 'Jervin', 20, date(2001,10,3), 'Mylode', 'Kanyakumari', 'TamilNadu', date(2024,8,12)),
    (8, 'Nitish', 22, date(2001,5,31), 'Tirunelveli', 'Tirunelveli', 'TamilNadu', date(2024,8,12))
    ]

incoming_2 = spark.createDataFrame(data2, schema)
incoming_2.display()

Id,Name,Age,DOB,City,District,State,Last_Modified
1,Ajimal,22,2001-07-01,Mylode,Kanyakumari,TamilNadu,2024-08-12
2,Jervin,20,2001-10-03,Mylode,Kanyakumari,TamilNadu,2024-08-12
8,Nitish,22,2001-05-31,Tirunelveli,Tirunelveli,TamilNadu,2024-08-12


In [0]:
# Data
data3 = [
    (8, 'Nitish', 23, date(2001,5,31), 'Tirunelveli', 'Tirunelveli', 'TamilNadu', date(2024,8,13)),
    (9, 'Abishake', 20, date(2003,7,1), 'Mylode', 'Kanyakumari', 'TamilNadu', date(2024,8,13))
    ]

incoming_3 = spark.createDataFrame(data3, schema)
incoming_3.display()

Id,Name,Age,DOB,City,District,State,Last_Modified
8,Nitish,23,2001-05-31,Tirunelveli,Tirunelveli,TamilNadu,2024-08-13
9,Abishake,20,2003-07-01,Mylode,Kanyakumari,TamilNadu,2024-08-13


In [0]:
# Union to get a single DataFrame
source_df = incoming_1.union(incoming_2).union(incoming_3)
source_df.display()

Id,Name,Age,DOB,City,District,State,Last_Modified
1,Ajimal,23,2001-07-01,Mylode,Kanyakumari,TamilNadu,2024-08-11
2,Jervin,23,2001-10-03,Mylode,Kanyakumari,TamilNadu,2024-08-11
6,Aravindh,22,2001-03-31,TM Palayam,Coimbatore,TamilNadu,2024-08-11
7,Pavithran,24,2000-11-20,Serupanacheri,Chennai,TamilNadu,2024-08-11
1,Ajimal,22,2001-07-01,Mylode,Kanyakumari,TamilNadu,2024-08-12
2,Jervin,20,2001-10-03,Mylode,Kanyakumari,TamilNadu,2024-08-12
8,Nitish,22,2001-05-31,Tirunelveli,Tirunelveli,TamilNadu,2024-08-12
8,Nitish,23,2001-05-31,Tirunelveli,Tirunelveli,TamilNadu,2024-08-13
9,Abishake,20,2003-07-01,Mylode,Kanyakumari,TamilNadu,2024-08-13


In [0]:
# Get distinct Last_Modified dates
DisDF = source_df.dropDuplicates(["Last_Modified"]).select("Last_Modified").collect()
print(DisDF)

[Row(Last_Modified=datetime.date(2024, 8, 11)), Row(Last_Modified=datetime.date(2024, 8, 12)), Row(Last_Modified=datetime.date(2024, 8, 13))]


In [0]:
# Checking 
df = source_df.filter(source_df.Last_Modified == '2024-08-13')
df.show()

+---+--------+---+----------+-----------+-----------+---------+-------------+
| Id|    Name|Age|       DOB|       City|   District|    State|Last_Modified|
+---+--------+---+----------+-----------+-----------+---------+-------------+
|  8|  Nitish| 23|2001-05-31|Tirunelveli|Tirunelveli|TamilNadu|   2024-08-13|
|  9|Abishake| 20|2003-07-01|     Mylode|Kanyakumari|TamilNadu|   2024-08-13|
+---+--------+---+----------+-----------+-----------+---------+-------------+



In [0]:
# Looping through dates
for i in DisDF:
    print(i.Last_Modified)
    df = source_df.filter(source_df.Last_Modified == i.Last_Modified)
    df.show()

2024-08-11
+---+---------+---+----------+-------------+-----------+---------+-------------+
| Id|     Name|Age|       DOB|         City|   District|    State|Last_Modified|
+---+---------+---+----------+-------------+-----------+---------+-------------+
|  1|   Ajimal| 23|2001-07-01|       Mylode|Kanyakumari|TamilNadu|   2024-08-11|
|  2|   Jervin| 23|2001-10-03|       Mylode|Kanyakumari|TamilNadu|   2024-08-11|
|  6| Aravindh| 22|2001-03-31|   TM Palayam| Coimbatore|TamilNadu|   2024-08-11|
|  7|Pavithran| 24|2000-11-20|Serupanacheri|    Chennai|TamilNadu|   2024-08-11|
+---+---------+---+----------+-------------+-----------+---------+-------------+

2024-08-12
+---+------+---+----------+-----------+-----------+---------+-------------+
| Id|  Name|Age|       DOB|       City|   District|    State|Last_Modified|
+---+------+---+----------+-----------+-----------+---------+-------------+
|  1|Ajimal| 22|2001-07-01|     Mylode|Kanyakumari|TamilNadu|   2024-08-12|
|  2|Jervin| 20|2001-10-0

In [0]:
# Get distinct Last_Modified dates
dis_df = source_df.dropDuplicates(["Last_Modified"]).select("Last_Modified").collect()

for i in dis_df:
    # Filter data for the current Last_Modified date
    filter_df = source_df.filter(source_df.Last_Modified == i.Last_Modified)
    
    # Join filtered data with the target table on ID and Flag = 'Y'
    join_df = filter_df.join(target_table.toDF(), 
                                (filter_df.Id == target_table.toDF().Id) & 
                                (target_table.toDF().Flag == "Y"), 
                                "leftouter")\
                          .select(filter_df.Id, 
                                  filter_df.Name, 
                                  filter_df.Age,
                                  filter_df.DOB,
                                  filter_df.City, 
                                  filter_df.District,
                                  filter_df.State, 
                                  target_table.toDF().Id.alias('T_Id'),
                                  target_table.toDF().Name.alias('T_Name'),
                                  target_table.toDF().Age.alias('T_Age'),
                                  target_table.toDF().DOB.alias('T_DOB'),
                                  target_table.toDF().City.alias('T_City'),
                                  target_table.toDF().District.alias('T_District'),
                                  target_table.toDF().State.alias('T_State'))
    
    # Filter the modified joined data using hash value
    filter_df = join_df.filter(xxhash64(join_df.Id, join_df.Name, join_df.Age, join_df.DOB, join_df.City, join_df.District, join_df.State) != 
                               xxhash64(join_df.T_Id, join_df.T_Name, join_df.T_Age, join_df.T_DOB, join_df.T_City, join_df.T_District, join_df.T_State))
    
    # Create a merge key column
    mergekey_df = filter_df.withColumn("Merge_Key", filter_df.Id)
    
    # Create a dummy record for existing records
    dummy_df = mergekey_df.filter("T_Id is not null").withColumn("Merge_Key", lit(None))
    
    # Union the dummy record with the filtered record
    main_df = mergekey_df.union(dummy_df)
    
    # Merge the data frame into the target table
    target_table.alias("t").merge(
        source=main_df.alias("s"),
        condition="t.Id = s.Merge_Key"
    ).whenMatchedUpdate(
        condition="t.Flag = 'Y'",
        set={
            "t.Flag": lit("N"),
            "t.End_Date": current_timestamp()
        }
    ).whenNotMatchedInsert(
        values={
            "Id": "s.Id",
            "Name": "s.Name",
            "Age": "s.Age",
            "DOB": "s.DOB",
            "City": "s.City",
            "District": "s.District",
            "State": "s.State",
            "Flag": lit("Y"),
            "Start_Date": current_timestamp(),
            "End_Date": lit("9999-12-31")
        }
    ).execute()


In [0]:
spark.sql('select * from customer2 order by id').show()

+---+---------+---+----------+-------------+-----------+---------+----+--------------------+--------------------+
| Id|     Name|Age|       DOB|         City|   District|    State|Flag|          Start_Date|            End_Date|
+---+---------+---+----------+-------------+-----------+---------+----+--------------------+--------------------+
|  1|   Ajimal| 23|2001-07-01|         Gobi|      Erode|TamilNadu|   N|2024-08-12 04:40:...|2024-08-12 04:40:...|
|  1|   Ajimal| 23|2001-07-01|       Mylode|Kanyakumari|TamilNadu|   N|2024-08-12 04:40:...|2024-08-12 04:40:...|
|  1|   Ajimal| 22|2001-07-01|       Mylode|Kanyakumari|TamilNadu|   Y|2024-08-12 04:40:...| 9999-12-31 00:00:00|
|  2|   Jervin| 23|2001-10-03|       Mylode|Kanyakumari|TamilNadu|   N|2024-08-12 04:40:...|2024-08-12 04:40:...|
|  2|   Jervin| 20|2001-10-03|       Mylode|Kanyakumari|TamilNadu|   Y|2024-08-12 04:40:...| 9999-12-31 00:00:00|
|  3|   Bergin| 24|2000-01-20|     Pollachi| Coimbatore|TamilNadu|   Y|2024-08-12 04:40:

In [0]:
ls  -l /tmp #for finding tmp folder