In [2]:
import findspark
findspark.init()
import pyspark
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Describe a SparkSession that has the Delta Lake SQL extension and catalog enabled.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("DeltaTutorial")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the delta-spark helper finish configuring the builder, then start
# (or reuse) the session that every later cell refers to as `spark`.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [8]:
# Create the demo database (if it is missing) at an explicit on-disk location.
create_db_sql = 'CREATE DATABASE IF NOT EXISTS f1_demo LOCATION "file:/E:/unused/Udemy/Spark_practice/raw/Delta lake/db"'
demo_db = spark.sql(create_db_sql)

In [3]:
# spark.sql('DROP DATABASE f1_demo CASCADE')

In [4]:
# Load the raw race-results JSON file and preview the first two rows.
results_json_path = r'E:\unused\Udemy\Spark_practice\raw\incremental_load_data\raw files\2021-03-28\results.json'
results_df = spark.read.json(results_json_path)
results_df.show(2)

+-------------+--------+----------+---------------+--------------+----+----+------------+------+------+--------+-------------+------------+------+----+--------+--------+-----------+
|constructorId|driverId|fastestLap|fastestLapSpeed|fastestLapTime|grid|laps|milliseconds|number|points|position|positionOrder|positionText|raceId|rank|resultId|statusId|       time|
+-------------+--------+----------+---------------+--------------+----+----+------------+------+------+--------+-------------+------------+------+----+--------+--------+-----------+
|          131|       1|        44|        207.235|      1:34.015|   2|  56|     5523897|    44|    25|       1|            1|           1|  1052|   4|   24966|       1|1:32:03.897|
|            9|     830|        41|        208.984|      1:33.228|   1|  56|     5524642|    33|    18|       2|            2|           2|  1052|   2|   24967|       1|     +0.745|
+-------------+--------+----------+---------------+--------------+----+----+------------+-

https://docs.delta.io/latest/index.html

In [5]:
# Persist the results as a catalog Delta table, replacing any existing contents.
(
    results_df.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('f1_demo.results')
)

In [6]:
# Write the same data as a path-based Delta table (addressed by directory, not by name).
(
    results_df.write
    .format('delta')
    .mode('overwrite')
    .save('file/result_delta')
)

In [None]:
# Register an external Delta table over the files written by the path-based save.
# LOCATION must be the Delta table directory itself ("file/result_delta", the
# directory passed to .save()), not its parent directory "file".
# NOTE(review): the path is relative; confirm Spark resolves it against the same
# base directory that results_df.write...save('file/result_delta') used.
spark.sql('CREATE TABLE f1_demo.results_delta_ext USING delta LOCATION "file/result_delta"')

In [8]:
# Read the path-based Delta table back and preview two rows.
delta_reader = spark.read.format('delta')
result_df = delta_reader.load("file/result_delta")
result_df.show(2)

+-------------+--------+----------+---------------+--------------+----+----+------------+------+------+--------+-------------+------------+------+----+--------+--------+-----------+
|constructorId|driverId|fastestLap|fastestLapSpeed|fastestLapTime|grid|laps|milliseconds|number|points|position|positionOrder|positionText|raceId|rank|resultId|statusId|       time|
+-------------+--------+----------+---------------+--------------+----+----+------------+------+------+--------+-------------+------------+------+----+--------+--------+-----------+
|          131|       1|        44|        207.235|      1:34.015|   2|  56|     5523897|    44|    25|       1|            1|           1|  1052|   4|   24966|       1|1:32:03.897|
|            9|     830|        41|        208.984|      1:33.228|   1|  56|     5524642|    33|    18|       2|            2|           2|  1052|   2|   24967|       1|     +0.745|
+-------------+--------+----------+---------------+--------------+----+----+------------+-

In [9]:
# Save the results as a Delta table partitioned by constructorId.
(
    results_df.write
    .format('delta')
    .mode('overwrite')
    .partitionBy('constructorId')
    .saveAsTable('f1_demo.results_partition')
)

In [10]:
# spark.sql('SHOW PARTITIONS f1_demo.results_partition').show()
# Query the partitioned table by name and preview two rows.
partition_select_sql = 'SELECT * FROM f1_demo.results_partition'
result_df = spark.sql(partition_select_sql)
result_df.show(2)

+-------------+--------+----------+---------------+--------------+----+----+------------+------+------+--------+-------------+------------+------+----+--------+--------+-----------+
|constructorId|driverId|fastestLap|fastestLapSpeed|fastestLapTime|grid|laps|milliseconds|number|points|position|positionOrder|positionText|raceId|rank|resultId|statusId|       time|
+-------------+--------+----------+---------------+--------------+----+----+------------+------+------+--------+-------------+------------+------+----+--------+--------+-----------+
|          131|       1|        44|        207.235|      1:34.015|   2|  56|     5523897|    44|    25|       1|            1|           1|  1052|   4|   24966|       1|1:32:03.897|
|          131|     822|        56|        211.566|      1:32.090|   3|  56|     5561280|    77|    16|       3|            3|           3|  1052|   1|   24968|       1|    +37.383|
+-------------+--------+----------+---------------+--------------+----+----+------------+-

## UPDATE

In [11]:
# Re-score rows with position <= 10 in place; show() prints the affected-row count.
update_points_sql = 'UPDATE f1_demo.results_partition SET points = 11 - position where position<=10'
update_result = spark.sql(update_points_sql)
update_result.show()

+-----------------+
|num_affected_rows|
+-----------------+
|               10|
+-----------------+



In [12]:
from delta.tables import DeltaTable

# Open the path-based Delta table through the Python DeltaTable API.
deltaTable = DeltaTable.forPath(spark, "file/result_delta")

# Update rows matching the filter; values in the mapping are SQL expressions.
top_ten_filter = 'position <= 10'
new_points = {'points': '21-position'}
deltaTable.update(top_ten_filter, new_points)

# Re-read the table to see the updated points.
result_df = (
    spark.read
    .format('delta')
    .load("file/result_delta")
)
result_df.show(2)

+-------------+--------+----------+---------------+--------------+----+----+------------+------+------+--------+-------------+------------+------+----+--------+--------+-----------+
|constructorId|driverId|fastestLap|fastestLapSpeed|fastestLapTime|grid|laps|milliseconds|number|points|position|positionOrder|positionText|raceId|rank|resultId|statusId|       time|
+-------------+--------+----------+---------------+--------------+----+----+------------+------+------+--------+-------------+------------+------+----+--------+--------+-----------+
|          131|       1|        44|        207.235|      1:34.015|   2|  56|     5523897|    44|    20|       1|            1|           1|  1052|   4|   24966|       1|1:32:03.897|
|            9|     830|        41|        208.984|      1:33.228|   1|  56|     5524642|    33|    19|       2|            2|           2|  1052|   2|   24967|       1|     +0.745|
+-------------+--------+----------+---------------+--------------+----+----+------------+-

## DELETE

In [13]:
# Remove rows with position > 10 from the partitioned table, then count what is left.
delete_sql = 'DELETE from f1_demo.results_partition WHERE position>10'
count_sql = 'SELECT count(*) FROM f1_demo.results_partition'

spark.sql(delete_sql)
result_df = spark.sql(count_sql)
result_df.show()

+--------+
|count(1)|
+--------+
|      12|
+--------+



In [14]:
# Same delete through the DeltaTable API on the path-based table, then re-read it.
deltaTable.delete("position>10")
result_df = (
    spark.read
    .format('delta')
    .load("file/result_delta")
)
result_df.show()

+-------------+--------+----------+---------------+--------------+----+----+------------+------+------+--------+-------------+------------+------+----+--------+--------+-----------+
|constructorId|driverId|fastestLap|fastestLapSpeed|fastestLapTime|grid|laps|milliseconds|number|points|position|positionOrder|positionText|raceId|rank|resultId|statusId|       time|
+-------------+--------+----------+---------------+--------------+----+----+------------+------+------+--------+-------------+------------+------+----+--------+--------+-----------+
|          131|       1|        44|        207.235|      1:34.015|   2|  56|     5523897|    44|    20|       1|            1|           1|  1052|   4|   24966|       1|1:32:03.897|
|            9|     830|        41|        208.984|      1:33.228|   1|  56|     5524642|    33|    19|       2|            2|           2|  1052|   2|   24967|       1|     +0.745|
|          131|     822|        56|        211.566|      1:32.090|   3|  56|     5561280| 

## MERGE/ UPSERT

In [15]:
# Day 1 batch: drivers with driverId <= 10, with the nested name fields flattened.
day1_json_path = r'E:\unused\Udemy\Spark_practice\raw\incremental_load_data\raw files\2021-03-28\drivers.json'
drivers_day1_df = (
    spark.read
    .option("inferSchema", True)
    .json(day1_json_path)
    .filter("driverId <= 10")
    .select("driverId", "dob", "name.forename", "name.surname")
)

# Expose the batch to SQL so it can serve as a MERGE source.
drivers_day1_df.createOrReplaceTempView("drivers_day1")
drivers_day1_df.show()

+--------+----------+---------+----------+
|driverId|       dob| forename|   surname|
+--------+----------+---------+----------+
|       1|1985-01-07|    Lewis|  Hamilton|
|       2|1977-05-10|     Nick|  Heidfeld|
|       3|1985-06-27|     Nico|   Rosberg|
|       4|1981-07-29| Fernando|    Alonso|
|       5|1981-10-19|   Heikki|Kovalainen|
|       6|1985-01-11|   Kazuki|  Nakajima|
|       7|1979-02-28|Sébastien|  Bourdais|
|       8|1979-10-17|     Kimi| Räikkönen|
|       9|1984-12-07|   Robert|    Kubica|
|      10|1982-03-18|     Timo|     Glock|
+--------+----------+---------+----------+



In [16]:
# Day 2 batch: driverId 6-15 with upper-cased names, so ids 6-10 overlap day 1
# (updates) and ids 11-15 are new (inserts).
day2_json_path = r'E:\unused\Udemy\Spark_practice\raw\incremental_load_data\raw files\2021-03-28\drivers.json'
drivers_day2_df = (
    spark.read
    .option("inferSchema", True)
    .json(day2_json_path)
    .filter("driverId BETWEEN 6 AND 15")
    .select(
        "driverId",
        "dob",
        upper("name.forename").alias("forename"),
        upper("name.surname").alias("surname"),
    )
)

# Expose the batch to SQL so it can serve as a MERGE source.
drivers_day2_df.createOrReplaceTempView("drivers_day2")
drivers_day2_df.show()

+--------+----------+---------+----------+
|driverId|       dob| forename|   surname|
+--------+----------+---------+----------+
|       6|1985-01-11|   KAZUKI|  NAKAJIMA|
|       7|1979-02-28|SÉBASTIEN|  BOURDAIS|
|       8|1979-10-17|     KIMI| RÄIKKÖNEN|
|       9|1984-12-07|   ROBERT|    KUBICA|
|      10|1982-03-18|     TIMO|     GLOCK|
|      11|1977-01-28|   TAKUMA|      SATO|
|      12|1985-07-25|   NELSON|PIQUET JR.|
|      13|1981-04-25|   FELIPE|     MASSA|
|      14|1971-03-27|    DAVID| COULTHARD|
|      15|1974-07-13|    JARNO|    TRULLI|
+--------+----------+---------+----------+



In [17]:
# Day 3 batch: driverId 1-5 and 16-20, with upper-cased names.
day3_json_path = r'E:\unused\Udemy\Spark_practice\raw\incremental_load_data\raw files\2021-03-28\drivers.json'
drivers_day3_df = (
    spark.read
    .option("inferSchema", True)
    .json(day3_json_path)
    .filter("driverId BETWEEN 1 AND 5 OR driverId BETWEEN 16 AND 20")
    .select(
        "driverId",
        "dob",
        upper("name.forename").alias("forename"),
        upper("name.surname").alias("surname"),
    )
)

drivers_day3_df.show()

+--------+----------+---------+----------+
|driverId|       dob| forename|   surname|
+--------+----------+---------+----------+
|       1|1985-01-07|    LEWIS|  HAMILTON|
|       2|1977-05-10|     NICK|  HEIDFELD|
|       3|1985-06-27|     NICO|   ROSBERG|
|       4|1981-07-29| FERNANDO|    ALONSO|
|       5|1981-10-19|   HEIKKI|KOVALAINEN|
|      16|1983-01-11|   ADRIAN|     SUTIL|
|      17|1976-08-27|     MARK|    WEBBER|
|      18|1980-01-19|   JENSON|    BUTTON|
|      19|1979-04-18|  ANTHONY|  DAVIDSON|
|      20|1987-07-03|SEBASTIAN|    VETTEL|
+--------+----------+---------+----------+



In [23]:
# Create the MERGE target table (if it does not exist) with created/updated date columns.
create_drivers_merge_sql = (
    'CREATE TABLE IF NOT EXISTS f1_demo.drivers_merge ( '
    'driverId INT, dob DATE, forename STRING,  surname STRING, '
    'createdDate DATE,  updatedDate DATE ) USING DELTA'
)
spark.sql(create_drivers_merge_sql)

DataFrame[]

In [24]:
# Upsert the day 1 batch: matched driverIds are updated and stamped with
# updatedDate, unmatched ones are inserted and stamped with createdDate.
merge_day1_sql = (
    'MERGE INTO f1_demo.drivers_merge tgt '
    'USING drivers_day1 upd '
    'ON tgt.driverId=upd.driverId '
    'WHEN MATCHED THEN UPDATE SET '
    'tgt.dob = upd.dob,'
    'tgt.forename = upd.forename,'
    'tgt.surname = upd.surname,'
    'tgt.updatedDate = current_timestamp '
    'WHEN NOT MATCHED THEN '
    'INSERT (driverId, dob, forename,surname,createdDate ) '
    'VALUES (driverId, dob, forename,surname, current_timestamp)'
)
merge_day1_metrics = spark.sql(merge_day1_sql)
merge_day1_metrics.show()

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|               10|               0|               0|               10|
+-----------------+----------------+----------------+-----------------+



`MERGE INTO target
USING source
ON source.key = target.key
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *
WHEN NOT MATCHED BY SOURCE THEN
  DELETE`  
Use the `WHEN NOT MATCHED BY SOURCE` clause to UPDATE or DELETE records in the target table that do not have corresponding records in the source table. We recommend adding an optional conditional clause to avoid fully rewriting the target table.

In [25]:
# Inspect the merge target after the day 1 load.
drivers_after_day1 = spark.sql('SELECT * FROM f1_demo.drivers_merge')
drivers_after_day1.show()

+--------+----------+---------+----------+-----------+-----------+
|driverId|       dob| forename|   surname|createdDate|updatedDate|
+--------+----------+---------+----------+-----------+-----------+
|       7|1979-02-28|Sébastien|  Bourdais| 2024-02-10|       null|
|       6|1985-01-11|   Kazuki|  Nakajima| 2024-02-10|       null|
|       9|1984-12-07|   Robert|    Kubica| 2024-02-10|       null|
|       5|1981-10-19|   Heikki|Kovalainen| 2024-02-10|       null|
|       1|1985-01-07|    Lewis|  Hamilton| 2024-02-10|       null|
|      10|1982-03-18|     Timo|     Glock| 2024-02-10|       null|
|       3|1985-06-27|     Nico|   Rosberg| 2024-02-10|       null|
|       8|1979-10-17|     Kimi| Räikkönen| 2024-02-10|       null|
|       2|1977-05-10|     Nick|  Heidfeld| 2024-02-10|       null|
|       4|1981-07-29| Fernando|    Alonso| 2024-02-10|       null|
+--------+----------+---------+----------+-----------+-----------+



In [26]:
# Upsert the day 2 batch: matched driverIds are updated and stamped with
# updatedDate, unmatched ones are inserted and stamped with createdDate.
merge_day2_sql = (
    'MERGE INTO f1_demo.drivers_merge tgt '
    'USING drivers_day2 upd '
    'ON tgt.driverId=upd.driverId '
    'WHEN MATCHED THEN UPDATE SET '
    'tgt.dob = upd.dob,'
    'tgt.forename = upd.forename,'
    'tgt.surname = upd.surname,'
    'tgt.updatedDate = current_timestamp '
    'WHEN NOT MATCHED THEN '
    'INSERT (driverId, dob, forename,surname,createdDate ) '
    'VALUES (driverId, dob, forename,surname, current_timestamp)'
)
merge_day2_metrics = spark.sql(merge_day2_sql)
merge_day2_metrics.show()

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|               10|               5|               0|                5|
+-----------------+----------------+----------------+-----------------+



In [27]:
# Inspect the merge target after the day 2 upsert.
drivers_after_day2 = spark.sql('SELECT * FROM f1_demo.drivers_merge')
drivers_after_day2.show()

+--------+----------+---------+----------+-----------+-----------+
|driverId|       dob| forename|   surname|createdDate|updatedDate|
+--------+----------+---------+----------+-----------+-----------+
|       1|1985-01-07|    Lewis|  Hamilton| 2024-02-10|       null|
|       2|1977-05-10|     Nick|  Heidfeld| 2024-02-10|       null|
|       3|1985-06-27|     Nico|   Rosberg| 2024-02-10|       null|
|       4|1981-07-29| Fernando|    Alonso| 2024-02-10|       null|
|       5|1981-10-19|   Heikki|Kovalainen| 2024-02-10|       null|
|       6|1985-01-11|   KAZUKI|  NAKAJIMA| 2024-02-10| 2024-02-10|
|       7|1979-02-28|SÉBASTIEN|  BOURDAIS| 2024-02-10| 2024-02-10|
|       8|1979-10-17|     KIMI| RÄIKKÖNEN| 2024-02-10| 2024-02-10|
|       9|1984-12-07|   ROBERT|    KUBICA| 2024-02-10| 2024-02-10|
|      10|1982-03-18|     TIMO|     GLOCK| 2024-02-10| 2024-02-10|
|      11|1977-01-28|   TAKUMA|      SATO| 2024-02-10|       null|
|      12|1985-07-25|   NELSON|PIQUET JR.| 2024-02-10|       n

In [28]:
from delta.tables import DeltaTable

# Day 3 upsert through the Python API, addressing the table by its storage path.
deltaTablePeople = DeltaTable.forPath(spark, r"E:/unused/Udemy/Spark_practice/raw/Delta lake/db/drivers_merge")

# Columns refreshed on rows whose driverId already exists in the target.
matched_updates = {
    "dob": "upd.dob",
    "forename": "upd.forename",
    "surname": "upd.surname",
    "updatedDate": "current_timestamp()",
}

# Columns populated for driverIds that are not yet in the target.
new_row_values = {
    "driverId": "upd.driverId",
    "dob": "upd.dob",
    "forename": "upd.forename",
    "surname": "upd.surname",
    "createdDate": "current_timestamp()",
}

(
    deltaTablePeople.alias('tgt')
    .merge(drivers_day3_df.alias('upd'), "tgt.driverId = upd.driverId")
    .whenMatchedUpdate(set=matched_updates)
    .whenNotMatchedInsert(values=new_row_values)
    .execute()
)

In [29]:
# Inspect the merge target after the day 3 upsert.
drivers_after_day3 = spark.sql('SELECT * FROM f1_demo.drivers_merge')
drivers_after_day3.show()

+--------+----------+---------+----------+-----------+-----------+
|driverId|       dob| forename|   surname|createdDate|updatedDate|
+--------+----------+---------+----------+-----------+-----------+
|       1|1985-01-07|    LEWIS|  HAMILTON| 2024-02-10| 2024-02-10|
|       2|1977-05-10|     NICK|  HEIDFELD| 2024-02-10| 2024-02-10|
|       3|1985-06-27|     NICO|   ROSBERG| 2024-02-10| 2024-02-10|
|       4|1981-07-29| FERNANDO|    ALONSO| 2024-02-10| 2024-02-10|
|       5|1981-10-19|   HEIKKI|KOVALAINEN| 2024-02-10| 2024-02-10|
|       6|1985-01-11|   KAZUKI|  NAKAJIMA| 2024-02-10| 2024-02-10|
|       7|1979-02-28|SÉBASTIEN|  BOURDAIS| 2024-02-10| 2024-02-10|
|       8|1979-10-17|     KIMI| RÄIKKÖNEN| 2024-02-10| 2024-02-10|
|       9|1984-12-07|   ROBERT|    KUBICA| 2024-02-10| 2024-02-10|
|      10|1982-03-18|     TIMO|     GLOCK| 2024-02-10| 2024-02-10|
|      11|1977-01-28|   TAKUMA|      SATO| 2024-02-10|       null|
|      12|1985-07-25|   NELSON|PIQUET JR.| 2024-02-10|       n

## History Of Table

In [30]:
# List the table's commit history without truncating the wide columns.
history_df = spark.sql('DESCRIBE HISTORY f1_demo.drivers_merge')
history_df.show(truncate=False)

+-------+-----------------------+------+--------+------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation   |operationParameters                        

In [31]:
# Time travel by version number: read the table as of version 2.
version_2_sql = 'SELECT * FROM f1_demo.drivers_merge VERSION AS OF 2'
spark.sql(version_2_sql).show()

+--------+----------+---------+----------+-----------+-----------+
|driverId|       dob| forename|   surname|createdDate|updatedDate|
+--------+----------+---------+----------+-----------+-----------+
|       1|1985-01-07|    Lewis|  Hamilton| 2024-02-10|       null|
|       2|1977-05-10|     Nick|  Heidfeld| 2024-02-10|       null|
|       3|1985-06-27|     Nico|   Rosberg| 2024-02-10|       null|
|       4|1981-07-29| Fernando|    Alonso| 2024-02-10|       null|
|       5|1981-10-19|   Heikki|Kovalainen| 2024-02-10|       null|
|       6|1985-01-11|   KAZUKI|  NAKAJIMA| 2024-02-10| 2024-02-10|
|       7|1979-02-28|SÉBASTIEN|  BOURDAIS| 2024-02-10| 2024-02-10|
|       8|1979-10-17|     KIMI| RÄIKKÖNEN| 2024-02-10| 2024-02-10|
|       9|1984-12-07|   ROBERT|    KUBICA| 2024-02-10| 2024-02-10|
|      10|1982-03-18|     TIMO|     GLOCK| 2024-02-10| 2024-02-10|
|      11|1977-01-28|   TAKUMA|      SATO| 2024-02-10|       null|
|      12|1985-07-25|   NELSON|PIQUET JR.| 2024-02-10|       n

In [33]:
# Time travel by timestamp: read the table as it was at the given instant.
as_of_timestamp_sql = 'SELECT * FROM f1_demo.drivers_merge TIMESTAMP AS OF "2024-02-10 21:05:13.505"'
spark.sql(as_of_timestamp_sql).show()

+--------+----------+---------+----------+-----------+-----------+
|driverId|       dob| forename|   surname|createdDate|updatedDate|
+--------+----------+---------+----------+-----------+-----------+
|       1|1985-01-07|    Lewis|  Hamilton| 2024-02-10|       null|
|       2|1977-05-10|     Nick|  Heidfeld| 2024-02-10|       null|
|       3|1985-06-27|     Nico|   Rosberg| 2024-02-10|       null|
|       4|1981-07-29| Fernando|    Alonso| 2024-02-10|       null|
|       5|1981-10-19|   Heikki|Kovalainen| 2024-02-10|       null|
|       6|1985-01-11|   KAZUKI|  NAKAJIMA| 2024-02-10| 2024-02-10|
|       7|1979-02-28|SÉBASTIEN|  BOURDAIS| 2024-02-10| 2024-02-10|
|       8|1979-10-17|     KIMI| RÄIKKÖNEN| 2024-02-10| 2024-02-10|
|       9|1984-12-07|   ROBERT|    KUBICA| 2024-02-10| 2024-02-10|
|      10|1982-03-18|     TIMO|     GLOCK| 2024-02-10| 2024-02-10|
|      11|1977-01-28|   TAKUMA|      SATO| 2024-02-10|       null|
|      12|1985-07-25|   NELSON|PIQUET JR.| 2024-02-10|       n

In [34]:
# Time-travel read by storage path using the DataFrame reader.
# Keep the DataFrame in `df` and display it separately: show() returns None,
# so chaining it onto the assignment would leave `df` bound to None.
df = (
    spark.read.format("delta")
    .option("timestampAsOf", "2024-02-10 21:05:13.505")
    .load(r"E:/unused/Udemy/Spark_practice/raw/Delta lake/db/drivers_merge")
)
df.show()
# The versionAsOf option can be used instead of timestampAsOf.

+--------+----------+---------+----------+-----------+-----------+
|driverId|       dob| forename|   surname|createdDate|updatedDate|
+--------+----------+---------+----------+-----------+-----------+
|       1|1985-01-07|    Lewis|  Hamilton| 2024-02-10|       null|
|       2|1977-05-10|     Nick|  Heidfeld| 2024-02-10|       null|
|       3|1985-06-27|     Nico|   Rosberg| 2024-02-10|       null|
|       4|1981-07-29| Fernando|    Alonso| 2024-02-10|       null|
|       5|1981-10-19|   Heikki|Kovalainen| 2024-02-10|       null|
|       6|1985-01-11|   KAZUKI|  NAKAJIMA| 2024-02-10| 2024-02-10|
|       7|1979-02-28|SÉBASTIEN|  BOURDAIS| 2024-02-10| 2024-02-10|
|       8|1979-10-17|     KIMI| RÄIKKÖNEN| 2024-02-10| 2024-02-10|
|       9|1984-12-07|   ROBERT|    KUBICA| 2024-02-10| 2024-02-10|
|      10|1982-03-18|     TIMO|     GLOCK| 2024-02-10| 2024-02-10|
|      11|1977-01-28|   TAKUMA|      SATO| 2024-02-10|       null|
|      12|1985-07-25|   NELSON|PIQUET JR.| 2024-02-10|       n

## VACUUM

You can remove files that are no longer referenced by a Delta table and are older than the retention threshold by running the VACUUM command on the table. VACUUM is not triggered automatically. The default retention threshold for the files is 7 days. If you don't run the VACUUM operation periodically on a Delta table, the metadata associated with the table may become large over time.

In [36]:
# Vacuum with the default retention threshold; the result is shown as a `path` column.
vacuum_result = spark.sql('VACUUM f1_demo.drivers_merge')
vacuum_result.show()

+--------------------+
|                path|
+--------------------+
|file:/E:/unused/U...|
+--------------------+



Scenario: we need to delete the historical versions of the table. (Under GDPR, if a person asks for their personal data to be deleted from a server, we must delete it, so keeping that data in historical versions can be punishable by law.)  
History cannot be deleted for a single record; the historical versions of the table as a whole must be deleted.

In [37]:
# Turn off the retention-duration check first, then vacuum with a zero-hour
# retention window.
disable_check_sql = 'SET spark.databricks.delta.retentionDurationCheck.enabled = false'
vacuum_all_sql = 'VACUUM f1_demo.drivers_merge RETAIN 0 HOURS'

spark.sql(disable_check_sql)
spark.sql(vacuum_all_sql).show()

+--------------------+
|                path|
+--------------------+
|file:/E:/unused/U...|
+--------------------+



In [None]:
spark.sql('SELECT * FROM f1_demo.drivers_merge TIMESTAMP AS OF "2024-02-10 21:05:13.505"').show()
# This raises an error because all the historical versions have been deleted.
# The table's HISTORY can still be viewed, but those old versions can no longer be queried.

#### Restoring data that was deleted by mistake

In [38]:
# Simulate an accidental delete of a single driver.
delete_driver_sql = 'DELETE FROM f1_demo.drivers_merge WHERE driverId = 1'
spark.sql(delete_driver_sql)

DataFrame[num_affected_rows: bigint]

In [39]:
# Restore the missing row: merge version 3 of the table back into the current
# version, inserting only the driverIds that are no longer present.
restore_sql = (
    'MERGE INTO f1_demo.drivers_merge tgt '
    'USING (SELECT * FROM f1_demo.drivers_merge VERSION AS OF 3) src '
    'ON tgt.driverId = src.driverId '
    'WHEN NOT MATCHED THEN INSERT *'
)
restore_metrics = spark.sql(restore_sql)
restore_metrics.show()

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|                1|               0|               0|                1|
+-----------------+----------------+----------------+-----------------+



## change retention period

In [5]:
# Storage path of the Delta table whose log retention is being changed.
drivers_merge_path = "E:/unused/Udemy/Spark_practice/raw/Delta lake/db/drivers_merge"

# Handle to the table (kept so the name `delta_table` stays available).
delta_table = DeltaTable.forPath(spark, drivers_merge_path)

# Log retention is a table property. Assigning `delta_table.logRetentionDuration`
# only creates an attribute on the Python wrapper object and never changes the
# table, so set `delta.logRetentionDuration` with ALTER TABLE instead.
# The trailing semicolon suppresses the empty DataFrame repr in the notebook.
spark.sql(
    f"ALTER TABLE delta.`{drivers_merge_path}` "
    "SET TBLPROPERTIES ('delta.logRetentionDuration' = 'interval 5 minutes')"
);

Due to log entry cleanup, instances can arise where you cannot time travel to a version that is less than the retention interval. Delta Lake requires all consecutive log entries since the previous checkpoint to time travel to a particular version. For example, with a table initially consisting of log entries for versions [0, 19] and a checkpoint at version 10, if the log entry for version 0 is cleaned up, then you cannot time travel to versions [1, 9]. Increasing the table property delta.logRetentionDuration can help avoid these situations.


Delta logs, also known as transaction logs, are created and used in Delta Lake to record all changes made to Delta tables.
When you perform operations such as inserts, updates, deletes, merges, or schema changes on a Delta table, Delta Lake generates corresponding Delta log entries.
* These log entries capture metadata about the operations, including the affected data files, the type of operation (e.g., insert, update, delete), timestamps, transaction identifiers, and other relevant information.
* Delta logs are stored as immutable, append-only files in the _delta_log directory within the Delta table's storage location.
* Write-Ahead Logging (WAL) ensures that changes are first recorded in the transaction log before being applied to the data files, ensuring durability and recoverability in case of failures.

## Convert Parquet to Delta

In [9]:
# Create a plain Parquet table (if it does not exist) to use as the conversion source.
create_parquet_table_sql = (
    'CREATE TABLE IF NOT EXISTS f1_demo.drivers_convert_to_delta ( '
    'driverId INT, dob DATE, forename STRING,  surname STRING, '
    'createdDate DATE,  updatedDate DATE ) USING PARQUET'
)
spark.sql(create_parquet_table_sql)

DataFrame[]

In [12]:
# Load the current snapshot of drivers_merge through the Delta reader.
# Reading the directory with spark.read.parquet would bypass the transaction
# log and could pick up data files that earlier DELETE/MERGE commits replaced.
df = spark.read.format('delta').load("E:/unused/Udemy/Spark_practice/raw/Delta lake/db/drivers_merge")

# Store that snapshot as a Parquet table, replacing any existing contents.
df.write.format('parquet').mode('overwrite').saveAsTable('f1_demo.drivers_convert_to_delta')

In [14]:
# Convert the Parquet table to Delta, addressing it by table name.
convert_table_sql = 'CONVERT TO DELTA f1_demo.drivers_convert_to_delta'
spark.sql(convert_table_sql)

DataFrame[]

In [17]:
# Export the table's rows as plain Parquet files in a new directory.
df = spark.table("f1_demo.drivers_convert_to_delta")
(
    df.write
    .format("parquet")
    .save("E:/unused/Udemy/Spark_practice/raw/Delta lake/db/drivers_convert_to_delta_new")
)

In [18]:
# Convert the Parquet directory to Delta, addressing it by path instead of by table name.
convert_path_sql = 'CONVERT TO DELTA parquet.`E:/unused/Udemy/Spark_practice/raw/Delta lake/db/drivers_convert_to_delta_new`'
spark.sql(convert_path_sql)

DataFrame[]