In [28]:
import pyspark
from delta import *
from pyspark.sql.types import *
from delta.tables import *
from pyspark.sql.functions import *

In [29]:
# Session builder for the "DeltaTutorial" app, with the Delta SQL extension
# and the Delta catalog set through Spark config keys.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("DeltaTutorial")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

In [30]:
# Pass the builder through the delta-spark pip helper, then create the session
# (or pick up the one that is already running).
delta_ready_builder = configure_spark_with_delta_pip(builder)
spark = delta_ready_builder.getOrCreate()

# Keep Spark's console logging down to ERROR-level messages.
spark.sparkContext.setLogLevel("ERROR")

In [31]:
print("Starting Delta table creation")

# Seed rows for the sample table, in the column order
# (Id, FirstName, LastName, Location, Age).
data = [
    (1, "Anurag", "Karki", "Dhumbarahi", 24),
    (2, "Anuska", "Karki", "Pragatitole", 21),
    (3, "Ngawang", "Gurung", "Chandol", 22),
    (4, "Beses", "Kafle", "Gatthaghar", 23),
    (5, "Kalyan", "Adhikari", "Chabhil", 26),
    (6, "Bishal", "Neupane", "Kapan", 20),
]

Starting Delta table creation


In [32]:
# Explicit schema for the sample rows; every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Id", IntegerType()),
        ("FirstName", StringType()),
        ("LastName", StringType()),
        ("Location", StringType()),
        ("Age", IntegerType()),
    )
])

In [33]:
# Build the sample DataFrame from the in-memory rows using the explicit schema.
sample_dataframe = spark.createDataFrame(data=data, schema=schema)

In [34]:
# Save the sample rows in Delta format under data/delta-table, using
# "overwrite" mode so existing contents at that path are replaced.
(
    sample_dataframe.write
    .format("delta")
    .mode("overwrite")
    .save("data/delta-table")
)

In [35]:
# Print the DataFrame's rows as a text table.
sample_dataframe.show()

+---+---------+--------+-----------+---+
| Id|FirstName|LastName|   Location|Age|
+---+---------+--------+-----------+---+
|  1|   Anurag|   Karki| Dhumbarahi| 24|
|  2|   Anuska|   Karki|Pragatitole| 21|
|  3|  Ngawang|  Gurung|    Chandol| 22|
|  4|    Beses|   Kafle| Gatthaghar| 23|
|  5|   Kalyan|Adhikari|    Chabhil| 26|
|  6|   Bishal| Neupane|      Kapan| 20|
+---+---------+--------+-----------+---+



In [36]:
# Print each column's name, type and nullability.
sample_dataframe.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Age: integer (nullable = true)



In [37]:
# Open the Delta table stored at data/delta-table as a DeltaTable handle
# so it can be updated, merged into and deleted from.
deltaTable = DeltaTable.forPath(spark, "data/delta-table")

In [38]:
# Correct the surname, location and age of the row(s) whose first name is "Kalyan".
# Id and FirstName are intentionally left out of the assignment map: FirstName is
# the match key (re-assigning it is a no-op), and assigning a literal Id would
# give every matching row the same Id.
deltaTable.update(
    condition=col("FirstName") == "Kalyan",
    set={
        "LastName": lit("Adhikaris"),
        "Location": lit("Chabhils"),
        "Age": lit(28),
    },
)

In [39]:
# Show the table's current rows to check the effect of the update.
deltaTable.toDF().show()

+---+---------+---------+-----------+---+
| Id|FirstName| LastName|   Location|Age|
+---+---------+---------+-----------+---+
|  5|   Kalyan|Adhikaris|   Chabhils| 28|
|  2|   Anuska|    Karki|Pragatitole| 21|
|  1|   Anurag|    Karki| Dhumbarahi| 24|
|  4|    Beses|    Kafle| Gatthaghar| 23|
|  3|  Ngawang|   Gurung|    Chandol| 22|
|  6|   Bishal|  Neupane|      Kapan| 20|
+---+---------+---------+-----------+---+



In [40]:
# Upsert in Delta Lake.
# upsert = update plus insert

# Incoming rows for the upsert, in the column order
# (id, FirstName, LastName, Location, Age).
data = [
    (1, "Ngawang", "Gurung", "Dallu", 23),
    (2, "Bishesh", "Kafle", "Jorpati", 24),
]

# Schema for the incoming rows; every column is declared nullable.
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("FirstName", StringType(), True),
    StructField("LastName", StringType(), True),
    StructField("Location", StringType(), True),
    StructField("Age", IntegerType(), True),
])

In [41]:
# Turn the incoming upsert rows into a DataFrame to use as the merge source.
newData = spark.createDataFrame(data=data,schema = schema)

In [42]:
# Upsert newData into the table, pairing rows on FirstName: a matched row has
# all five columns overwritten from the source, an unmatched source row is inserted.
# NOTE(review): Id is copied from the source even though rows are matched on
# FirstName, so the table can end up with repeated Id values - confirm this is intended.
upsert_columns = {
    "Id": col("newData.id"),
    "FirstName": col("newData.FirstName"),
    "LastName": col("newData.LastName"),
    "Location": col("newData.Location"),
    "Age": col("newData.Age"),
}

(
    deltaTable.alias("oldData")
    .merge(newData.alias("newData"), "oldData.FirstName = newData.FirstName")
    .whenMatchedUpdate(set=upsert_columns)
    .whenNotMatchedInsert(values=upsert_columns)
    .execute()
)

In [43]:
# Show the table's current rows to check the effect of the merge.
deltaTable.toDF().show()

+---+---------+---------+-----------+---+
| Id|FirstName| LastName|   Location|Age|
+---+---------+---------+-----------+---+
|  5|   Kalyan|Adhikaris|   Chabhils| 28|
|  2|   Anuska|    Karki|Pragatitole| 21|
|  1|   Anurag|    Karki| Dhumbarahi| 24|
|  4|    Beses|    Kafle| Gatthaghar| 23|
|  6|   Bishal|  Neupane|      Kapan| 20|
|  2|  Bishesh|    Kafle|    Jorpati| 24|
|  1|  Ngawang|   Gurung|      Dallu| 23|
+---+---------+---------+-----------+---+



In [44]:
print("Deleting data...!")

# Re-open the table from its path, then delete every row matching the predicate.
deltaTable = DeltaTable.forPath(spark, "data/delta-table")
kalyan_rows = expr('firstname == "Kalyan"')
deltaTable.delete(kalyan_rows)

Deleting data...!


In [45]:
# Show the table's current rows to check the effect of the delete.
deltaTable.toDF().show()

+---+---------+--------+-----------+---+
| Id|FirstName|LastName|   Location|Age|
+---+---------+--------+-----------+---+
|  2|   Anuska|   Karki|Pragatitole| 21|
|  1|   Anurag|   Karki| Dhumbarahi| 24|
|  4|    Beses|   Kafle| Gatthaghar| 23|
|  6|   Bishal| Neupane|      Kapan| 20|
|  2|  Bishesh|   Kafle|    Jorpati| 24|
|  1|  Ngawang|  Gurung|      Dallu| 23|
+---+---------+--------+-----------+---+



In [46]:
# Time travel: load the snapshot of the table recorded as version 0 and display it.
df_versionzero = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load("data/delta-table")
)
df_versionzero.show()

+---+---------+--------+-----------+---+
| Id|FirstName|LastName|   Location|Age|
+---+---------+--------+-----------+---+
|  2|   Anuska|   Karki|Pragatitole| 21|
|  5|   Kalyan|Adhikari|    Chabhil| 26|
|  1|   Anurag|   Karki| Dhumbarahi| 24|
|  4|    Beses|   Kafle| Gatthaghar| 23|
|  3|  Ngawang|  Gurung|    Chandol| 22|
|  6|   Bishal| Neupane|      Kapan| 20|
+---+---------+--------+-----------+---+



In [47]:
# Time travel: load the snapshot of the table recorded as version 1 and display it.
# Bound to its own name so the version 0 DataFrame is not overwritten by a
# frame that actually holds a different version.
df_versionone = (
    spark.read
    .format("delta")
    .option("versionAsOf", 1)
    .load("data/delta-table")
)
df_versionone.show()

+---+---------+---------+-----------+---+
| Id|FirstName| LastName|   Location|Age|
+---+---------+---------+-----------+---+
|  5|   Kalyan|Adhikaris|   Chabhils| 28|
|  2|   Anuska|    Karki|Pragatitole| 21|
|  1|   Anurag|    Karki| Dhumbarahi| 24|
|  4|    Beses|    Kafle| Gatthaghar| 23|
|  3|  Ngawang|   Gurung|    Chandol| 22|
|  6|   Bishal|  Neupane|      Kapan| 20|
+---+---------+---------+-----------+---+

