In [1]:
import pandas as pd

In [2]:
import findspark
# Point findspark at the local Spark installation so that pyspark can be imported
# in the following cell (presumably /opt/manual/spark links to the versioned install).
findspark.init("/opt/manual/spark")

In [3]:
from pyspark.sql import SparkSession, functions as F

In [4]:
# Delta Lake releases are tied to specific Spark versions:
#   - Spark 2.4 requires Delta Lake 0.6.0.
#   - Delta Lake 1.1.0 only works with Spark >= 3.2.0.
# This notebook pins Delta Lake 1.0.0.
DELTA_CORE_PACKAGE = "io.delta:delta-core_2.12:1.0.0"

spark = (
    SparkSession.builder
    .master("yarn")
    .appName("Delta Lake Basics")
    # Resolved and downloaded through Ivy/Maven when the session starts.
    .config("spark.jars.packages", DELTA_CORE_PACKAGE)
    # Register Delta's SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)



:: loading settings :: url = jar:file:/opt/manual/spark-3.1.1-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/train/.ivy2/cache
The jars for the packages stored in: /home/train/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4a8ff5e6-3e40-4580-9ce2-60d91c5f30d2;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.0.0 in central
	found org.antlr#antlr4;4.7 in central
	found org.antlr#antlr4-runtime;4.7 in central
	found org.antlr#antlr-runtime;3.5.2 in central
	found org.antlr#ST4;4.0.8 in central
	found org.abego.treelayout#org.abego.treelayout.core;1.0.3 in central
	found org.glassfish#javax.json;1.0.4 in central
	found com.ibm.icu#icu4j;58.2 in central
downloading https://repo1.maven.org/maven2/io/delta/delta-core_2.12/1.0.0/delta-core_2.12-1.0.0.jar ...
	[SUCCESSFUL ] io.delta#delta-core_2.12;1.0.0!delta-core_2.12.jar (706ms)
:: resolution report :: resolve 3412ms :: artifacts dl 748ms
	:: modules in use:
	com.ibm.icu#icu4j;58.2 from central in [default]
	io.delta#delta-core_2.12;1

In [5]:
# Keep this import AFTER the SparkSession cell: the delta Python module must be
# imported once the session exists (presumably because it is delivered through
# spark.jars.packages rather than installed in the Python environment).
# Import only the name that is used instead of a wildcard.
from delta.tables import DeltaTable

## Create a Spark DataFrame

In [6]:
# Five sample records (Id, Name, Branch) used to seed the Delta table.
customers = spark.createDataFrame(
    data=[
        (1416, "Tuncay Avcı", "İnsan Kaynakları"),
        (1417, "İsmail İçtüzer", "Üretim Planlama"),
        (1506, "Mustafa Aslan", "Halkla İlişkiler"),
        (1527, "Buket Durmuş", "Pazarlama"),
        (1525, "Saadet Yılmaz", "Teknik Koordinasyon"),
    ],
    schema=["Id", "Name", "Branch"],
)

In [7]:
# Preview the five source rows before they are written to Delta.
customers.show()

                                                                                

+----+--------------+-------------------+
|  Id|          Name|             Branch|
+----+--------------+-------------------+
|1416|   Tuncay Avcı|   İnsan Kaynakları|
|1417|İsmail İçtüzer|    Üretim Planlama|
|1506| Mustafa Aslan|   Halkla İlişkiler|
|1527|  Buket Durmuş|          Pazarlama|
|1525| Saadet Yılmaz|Teknik Koordinasyon|
+----+--------------+-------------------+



## Write the DataFrame to Delta Lake

In [8]:
! hdfs dfs -rm -R -skipTrash /user/train/ik_delta

rm: `/user/train/ik_delta': No such file or directory


In [9]:
# HDFS location of the Delta table; the `hdfs dfs` cells that inspect
# /user/train/delta/ik_delta look at this same directory.
deltaPath = "hdfs://localhost:9000/user/train/delta/ik_delta"

In [10]:
# Persist the DataFrame in Delta format at deltaPath, replacing any existing contents.
(
    customers.write
    .format("delta")
    .mode("overwrite")
    .save(deltaPath)
)

                                                                                

In [11]:
# List the table directory: the _delta_log folder plus the Parquet data files.
! hdfs dfs -ls /user/train/delta/ik_delta

Found 3 items
drwxr-xr-x   - train supergroup          0 2022-07-22 12:27 /user/train/delta/ik_delta/_delta_log
-rw-r--r--   1 train supergroup       1008 2022-07-22 12:27 /user/train/delta/ik_delta/part-00000-c4598889-4e44-42e9-8780-07ceb9d56648-c000.snappy.parquet
-rw-r--r--   1 train supergroup       1046 2022-07-22 12:27 /user/train/delta/ik_delta/part-00001-40d43340-babe-42ae-8690-503a9eaa5fec-c000.snappy.parquet


In [15]:
# List the transaction log; every commit is stored as a zero-padded, numbered JSON file.
! hdfs dfs -ls /user/train/delta/ik_delta/_delta_log

Found 1 items
-rw-r--r--   1 train supergroup       1071 2022-01-09 11:17 /user/train/delta/ik_delta/_delta_log/00000000000000000000.json


In [16]:
# Print the first commit (version 0): commitInfo, protocol, metaData and the "add" file actions.
! hdfs dfs -cat /user/train/delta/ik_delta/_delta_log/00000000000000000000.json

{"commitInfo":{"timestamp":1641716266274,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"isBlindAppend":false,"operationMetrics":{"numFiles":"2","numOutputBytes":"2054","numOutputRows":"5"}}}
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"metaData":{"id":"e127eff0-511f-4055-9b71-766ec552ca77","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"Id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"Name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"Branch\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1641716244601}}
{"add":{"path":"part-00000-810f23d4-912e-4838-b1ed-0434c4f69231-c000.snappy.parquet","partitionValues":{},"size":1008,"modificationTime":1641716252022,"dataChange":true}}
{"add":{"path":"part-00001-b1b9647a-3f00-46bb-8954-6dfdb06f6b32-c000.snappy.parquet","partitionV

## Read from Delta Lake as a DeltaTable

In [12]:
# Open the table stored at deltaPath as a DeltaTable handle; the merge, update and
# delete cells further down all operate on this handle.
customers_delta = DeltaTable.forPath(spark, deltaPath)

In [13]:
# Confirm that the handle is a delta.tables.DeltaTable rather than a DataFrame.
type(customers_delta)

delta.tables.DeltaTable

In [14]:
# Convert the DeltaTable handle to a DataFrame to display the rows currently in the table.
customers_delta.toDF().show()

                                                                                

+----+--------------+-------------------+
|  Id|          Name|             Branch|
+----+--------------+-------------------+
|1506| Mustafa Aslan|   Halkla İlişkiler|
|1527|  Buket Durmuş|          Pazarlama|
|1525| Saadet Yılmaz|Teknik Koordinasyon|
|1416|   Tuncay Avcı|   İnsan Kaynakları|
|1417|İsmail İçtüzer|    Üretim Planlama|
+----+--------------+-------------------+



In [17]:
! hdfs dfs -ls /user/train/ik_delta/_delta_log

Found 1 items
-rw-r--r--   1 train supergroup       1071 2022-01-08 13:52 /user/train/ik_delta/_delta_log/00000000000000000000.json


## New customer records

In [None]:
                                    # Reference only (nothing is executed in this cell): the rows currently in the Delta table.
                                    # The trailing "u" marks appear to flag the rows whose Id also occurs in customers_new,
                                    # i.e. the rows that the merge will update.
                                    #   (1416, "Tuncay Avcı", "İnsan Kaynakları")        u
                                    #   (1417, "İsmail İçtüzer", "Üretim Planlama")      u
                                    #   (1506, "Mustafa Aslan", "Halkla İlişkiler")
                                    #   (1527, "Buket Durmuş", "Pazarlama")
                                    #   (1525, "Saadet Yılmaz", "Teknik Koordinasyon")   u
                                    #   columns: ["Id", "Name", "Branch"]

In [15]:
# Incoming batch: Ids 1416, 1417 and 1525 already exist in the table with different
# values, while 1508 and 1522 are new.
customers_new = spark.createDataFrame(
    data=[
        (1416, "Tuncay Avcı", "Genel Koordinasyon"),
        (1417, "İsmail İçtüzer", "Genel Koordinasyon"),
        (1508, "Mustafa Bankur", "Üretim"),
        (1522, "Meliha Kaya", "Pazarlama"),
        (1525, "Saadet Yılmaz Kaya", "Teknik Koordinasyon"),
    ],
    schema=["Id", "Name", "Branch"],
)

In [16]:
# Preview the incoming batch before merging it into the Delta table.
customers_new.show()

+----+------------------+-------------------+
|  Id|              Name|             Branch|
+----+------------------+-------------------+
|1416|       Tuncay Avcı| Genel Koordinasyon|
|1417|    İsmail İçtüzer| Genel Koordinasyon|
|1508|    Mustafa Bankur|             Üretim|
|1522|       Meliha Kaya|          Pazarlama|
|1525|Saadet Yılmaz Kaya|Teknik Koordinasyon|
+----+------------------+-------------------+



## Upsert new customers into delta table

In [17]:
# Upsert customers_new into the Delta table, matching rows on Id:
#   - Ids present on both sides are overwritten with the incoming values,
#   - Ids that exist only in customers_new are inserted as new rows.
(
    customers_delta.alias("cust")
    .merge(customers_new.alias("cust_new"), "cust.Id = cust_new.Id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

                                                                                

In [18]:
# Sort by Id so the updated and inserted rows are easy to compare with the earlier output.
customers_delta.toDF().orderBy("Id").show()

[Stage 26:>                                                         (0 + 2) / 2]

+----+------------------+-------------------+
|  Id|              Name|             Branch|
+----+------------------+-------------------+
|1416|       Tuncay Avcı| Genel Koordinasyon|
|1417|    İsmail İçtüzer| Genel Koordinasyon|
|1506|     Mustafa Aslan|   Halkla İlişkiler|
|1508|    Mustafa Bankur|             Üretim|
|1522|       Meliha Kaya|          Pazarlama|
|1525|Saadet Yılmaz Kaya|Teknik Koordinasyon|
|1527|      Buket Durmuş|          Pazarlama|
+----+------------------+-------------------+



                                                                                

In [25]:
# The merge is recorded as a second commit file (version 1) in the transaction log.
! hdfs dfs -ls /user/train/delta/ik_delta/_delta_log

Found 2 items
-rw-r--r--   1 train supergroup       1071 2022-01-09 11:17 /user/train/delta/ik_delta/_delta_log/00000000000000000000.json
-rw-r--r--   1 train supergroup       2340 2022-01-09 11:28 /user/train/delta/ik_delta/_delta_log/00000000000000000001.json


In [26]:
# Print the MERGE commit: operation metrics followed by the remove/add file actions.
! hdfs dfs -cat /user/train/delta/ik_delta/_delta_log/00000000000000000001.json

{"commitInfo":{"timestamp":1641716885657,"operation":"MERGE","operationParameters":{"predicate":"(cust.`Id` = cust_new.`Id`)","matchedPredicates":"[{\"actionType\":\"update\"}]","notMatchedPredicates":"[{\"actionType\":\"insert\"}]"},"readVersion":0,"isBlindAppend":false,"operationMetrics":{"numTargetRowsCopied":"2","numTargetRowsDeleted":"0","numTargetFilesAdded":"8","executionTimeMs":"31969","numTargetRowsInserted":"2","scanTimeMs":"13340","numTargetRowsUpdated":"3","numOutputRows":"7","numSourceRows":"5","numTargetFilesRemoved":"2","rewriteTimeMs":"18574"}}}
{"remove":{"path":"part-00001-b1b9647a-3f00-46bb-8954-6dfdb06f6b32-c000.snappy.parquet","deletionTimestamp":1641716885560,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":1046}}
{"remove":{"path":"part-00000-810f23d4-912e-4838-b1ed-0434c4f69231-c000.snappy.parquet","deletionTimestamp":1641716885657,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":1008}}
{"add":{"path":"part-00

# Upsert with Multiple Conditions

In [27]:
# Second batch: 1520 is a new Id; 1522, 1525 and 1416 already exist in the table.
# Only 1522 arrives with a Branch other than 'Genel Koordinasyon'.
customers_new2 = spark.createDataFrame(
    data=[
        (1520, "Mustafa Gökçe", "Genel Koordinasyon"),
        (1522, "Meliha Kaya", "Satış"),
        (1525, "Saadet Yılmaz", "Genel Koordinasyon"),
        (1416, "Tuncay Döner", "Genel Koordinasyon"),
    ],
    schema=["Id", "Name", "Branch"],
)

In [28]:
# Merge with a compound match condition: a source row only counts as "matched" when
# the Ids are equal AND the incoming Branch is 'Genel Koordinasyon'.
# NOTE: source rows that fail the Branch test are treated as not matched and are
# therefore inserted, even when their Id already exists in the table
# (Id 1522 ends up twice after this cell).
(
    customers_delta.alias("cust")
    .merge(
        customers_new2.alias("cust_new2"),
        "cust.Id = cust_new2.Id AND cust_new2.Branch = 'Genel Koordinasyon'",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

In [30]:
# Display the table after the conditional merge (no orderBy, so row order is arbitrary).
customers_delta.toDF().show()

+----+--------------+------------------+
|  Id|          Name|            Branch|
+----+--------------+------------------+
|1417|İsmail İçtüzer|Genel Koordinasyon|
|1520| Mustafa Gökçe|Genel Koordinasyon|
|1525| Saadet Yılmaz|Genel Koordinasyon|
|1506| Mustafa Aslan|  Halkla İlişkiler|
|1416|  Tuncay Döner|Genel Koordinasyon|
|1527|  Buket Durmuş|         Pazarlama|
|1508|Mustafa Bankur|            Üretim|
|1522|   Meliha Kaya|         Pazarlama|
|1522|   Meliha Kaya|             Satış|
+----+--------------+------------------+



In [31]:
! hdfs dfs -ls /user/train/ik_delta/_delta_log

ls: `/user/train/ik_delta/_delta_log': No such file or directory


# Update

In [32]:
# Update a single row in place: change Name and Branch for Id 1527.
# Column expressions (F.col / F.lit) are used instead of hand-quoted SQL strings so that
# values containing quote characters cannot break the expression; this also matches
# the Column-based predicate used by the delete cell.
customers_delta.update(
    condition=F.col("Id") == 1527,
    set={
        "Name": F.lit("Buket Durmuş Çetinkaya"),
        "Branch": F.lit("Teknoloji"),
    },
)

In [33]:
# truncate=False keeps the long updated Name value from being cut off in the output.
customers_delta.toDF().show(truncate=False)

+----+----------------------+------------------+
|Id  |Name                  |Branch            |
+----+----------------------+------------------+
|1417|İsmail İçtüzer        |Genel Koordinasyon|
|1520|Mustafa Gökçe         |Genel Koordinasyon|
|1527|Buket Durmuş Çetinkaya|Teknoloji         |
|1525|Saadet Yılmaz         |Genel Koordinasyon|
|1506|Mustafa Aslan         |Halkla İlişkiler  |
|1416|Tuncay Döner          |Genel Koordinasyon|
|1508|Mustafa Bankur        |Üretim            |
|1522|Meliha Kaya           |Pazarlama         |
|1522|Meliha Kaya           |Satış             |
+----+----------------------+------------------+



In [34]:
# The update is recorded as one more commit file in the transaction log.
! hdfs dfs -ls /user/train/delta/ik_delta/_delta_log

Found 4 items
-rw-r--r--   1 train supergroup       1071 2022-01-09 11:17 /user/train/delta/ik_delta/_delta_log/00000000000000000000.json
-rw-r--r--   1 train supergroup       2340 2022-01-09 11:28 /user/train/delta/ik_delta/_delta_log/00000000000000000001.json
-rw-r--r--   1 train supergroup       1877 2022-01-09 11:30 /user/train/delta/ik_delta/_delta_log/00000000000000000002.json
-rw-r--r--   1 train supergroup        703 2022-01-09 11:34 /user/train/delta/ik_delta/_delta_log/00000000000000000003.json


# Delete

In [35]:
# Delete every row whose Id equals 1506, using a Column predicate.
customers_delta.delete(F.col("Id") == 1506)

In [36]:
# Id 1506 should no longer appear in the table.
customers_delta.toDF().show(truncate=False)

+----+----------------------+------------------+
|Id  |Name                  |Branch            |
+----+----------------------+------------------+
|1417|İsmail İçtüzer        |Genel Koordinasyon|
|1520|Mustafa Gökçe         |Genel Koordinasyon|
|1527|Buket Durmuş Çetinkaya|Teknoloji         |
|1525|Saadet Yılmaz         |Genel Koordinasyon|
|1416|Tuncay Döner          |Genel Koordinasyon|
|1508|Mustafa Bankur        |Üretim            |
|1522|Meliha Kaya           |Pazarlama         |
|1522|Meliha Kaya           |Satış             |
+----+----------------------+------------------+



In [37]:
# The delete adds one more commit file to the transaction log.
! hdfs dfs -ls /user/train/delta/ik_delta/_delta_log

Found 5 items
-rw-r--r--   1 train supergroup       1071 2022-01-09 11:17 /user/train/delta/ik_delta/_delta_log/00000000000000000000.json
-rw-r--r--   1 train supergroup       2340 2022-01-09 11:28 /user/train/delta/ik_delta/_delta_log/00000000000000000001.json
-rw-r--r--   1 train supergroup       1877 2022-01-09 11:30 /user/train/delta/ik_delta/_delta_log/00000000000000000002.json
-rw-r--r--   1 train supergroup        703 2022-01-09 11:34 /user/train/delta/ik_delta/_delta_log/00000000000000000003.json
-rw-r--r--   1 train supergroup        705 2022-01-09 11:35 /user/train/delta/ik_delta/_delta_log/00000000000000000004.json


In [38]:
# Stop the Spark session now that the walkthrough is finished.
spark.stop()