In [1]:
import pandas as pd
import findspark
findspark.init("/opt/manual/spark")

from pyspark.sql import SparkSession, functions as F

In [2]:
# Delta Lake wiring for the session: the delta-core package coordinates plus the
# SQL extension and catalog implementation classes, applied as plain Spark configs.
delta_settings = {
    "spark.jars.packages": "io.delta:delta-core_2.12:0.7.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

session_builder = SparkSession.builder.appName("Delta Lake").master("yarn")
for conf_key, conf_value in delta_settings.items():
    session_builder = session_builder.config(conf_key, conf_value)

# Reuses an already running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

In [3]:
# Explicit import instead of a wildcard so it is clear where DeltaTable comes from
# and nothing else leaks into the notebook namespace.
# NOTE(review): kept after the SparkSession cell — presumably the delta Python module
# only becomes importable once spark.jars.packages has been resolved; confirm before
# moving this into the top import cell.
from delta.tables import DeltaTable

In [4]:
# HDFS location used by the cells below to write the Delta table and to open it again.
deltaPath = "hdfs://localhost:9000/user/train/delta-ik"

In [6]:
# Initial customer records, one tuple per row in (Id, Name, Branch) order.
customer_rows = [
    (10001417, "Tuncay Kavcı", "BI Developer"),
    (10001418, "Tülay İçtiyar", "Data Scientist"),
    (10004055, "Arzu Taksici", "Data Engineer"),
    (10001505, "Tuncay Çadırcı", "Data Analist"),
    (10001526, "Sultan Balcı", "Jr. Developer"),
]
customers = spark.createDataFrame(customer_rows, ["Id", "Name", "Branch"])

In [7]:
# Print the rows of the in-memory customers DataFrame before it is written out.
customers.show()

+--------+--------------+--------------+
|      Id|          Name|        Branch|
+--------+--------------+--------------+
|10001417|  Tuncay Kavcı|  BI Developer|
|10001418| Tülay İçtiyar|Data Scientist|
|10004055|  Arzu Taksici| Data Engineer|
|10001505|Tuncay Çadırcı|  Data Analist|
|10001526|  Sultan Balcı| Jr. Developer|
+--------+--------------+--------------+



# Write the DataFrame as a Delta table

In [8]:
# Persist customers in Delta format at deltaPath; "overwrite" replaces any data
# already stored at that location.
(
    customers.write
    .format("delta")
    .mode("overwrite")
    .save(deltaPath)
)

# Read the Delta table

In [9]:
# Open the Delta table stored at deltaPath as a DeltaTable handle; the merge and
# update cells further down operate on this handle.
customers_delta = DeltaTable.forPath(spark, deltaPath)

In [10]:
# Convert the Delta table handle to a DataFrame and print its current rows.
customers_delta.toDF().show()

+--------+--------------+--------------+
|      Id|          Name|        Branch|
+--------+--------------+--------------+
|10004055|  Arzu Taksici| Data Engineer|
|10001505|Tuncay Çadırcı|  Data Analist|
|10001526|  Sultan Balcı| Jr. Developer|
|10001417|  Tuncay Kavcı|  BI Developer|
|10001418| Tülay İçtiyar|Data Scientist|
+--------+--------------+--------------+



# New customers

In [11]:
# Incoming batch in (Id, Name, Branch) order. Ids 10001417, 10001418 and 10001505
# also appear in the original customers data; 10055 and 101526 do not.
customer_new_rows = [
    (10001417, "Tuncay Kavcı", "BI Developer"),
    (10001418, "Tülay İçtiyar", "Data Scientist"),
    (10055, "Mehmet Taksici", "Data Engineer"),
    (10001505, "Tuncay Çadırcı", "Data Analist"),
    (101526, "Ayşe Balcı", "Jr. Developer"),
]
customer_new = spark.createDataFrame(customer_new_rows, ["Id", "Name", "Branch"])

In [12]:
# Print the incoming batch that will be merged into the Delta table.
customer_new.show()

+--------+--------------+--------------+
|      Id|          Name|        Branch|
+--------+--------------+--------------+
|10001417|  Tuncay Kavcı|  BI Developer|
|10001418| Tülay İçtiyar|Data Scientist|
|   10055|Mehmet Taksici| Data Engineer|
|10001505|Tuncay Çadırcı|  Data Analist|
|  101526|    Ayşe Balcı| Jr. Developer|
+--------+--------------+--------------+



# Upserts

In [13]:
# Upsert customer_new into the Delta table, matching rows on Id:
# matched rows are updated with all columns from the new batch,
# unmatched rows from the new batch are inserted.
merge_builder = (
    customers_delta.alias("cust")
    .merge(customer_new.alias("cust_new"), "cust.Id == cust_new.Id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
)
merge_builder.execute()

In [14]:
# Print the Delta table contents after the merge.
customers_delta.toDF().show()

+--------+--------------+--------------+
|      Id|          Name|        Branch|
+--------+--------------+--------------+
|10001418| Tülay İçtiyar|Data Scientist|
|10001505|Tuncay Çadırcı|  Data Analist|
|   10055|Mehmet Taksici| Data Engineer|
|10001526|  Sultan Balcı| Jr. Developer|
|10001417|  Tuncay Kavcı|  BI Developer|
|  101526|    Ayşe Balcı| Jr. Developer|
|10004055|  Arzu Taksici| Data Engineer|
+--------+--------------+--------------+



# Update

In [15]:
# Set Branch to 'Jr. Data' on every row whose Branch equals 'Data'.
# NOTE(review): no row in the tables printed in this notebook has Branch equal to
# 'Data', and the table shown after this cell is unchanged, so this predicate
# currently matches nothing — confirm the intended condition.
branch_is_data = F.expr("Branch == 'Data'")
customers_delta.update(
    condition=branch_is_data,
    set={"Branch": F.expr("'Jr. Data'")},
)

In [16]:
# Print the Delta table contents after the update call.
customers_delta.toDF().show()

+--------+--------------+--------------+
|      Id|          Name|        Branch|
+--------+--------------+--------------+
|10001418| Tülay İçtiyar|Data Scientist|
|10001505|Tuncay Çadırcı|  Data Analist|
|   10055|Mehmet Taksici| Data Engineer|
|10001526|  Sultan Balcı| Jr. Developer|
|10001417|  Tuncay Kavcı|  BI Developer|
|  101526|    Ayşe Balcı| Jr. Developer|
|10004055|  Arzu Taksici| Data Engineer|
+--------+--------------+--------------+

