In [38]:
import pyspark
from delta import *

# Build a local, single-threaded SparkSession with Delta Lake enabled.
#
# "spark.sql.extensions" must hold extension *class names*. The original code
# set that key a second time with the Elasticsearch connector's Maven
# coordinate, which replaced the Delta extension and never put the connector
# JAR on the classpath. The coordinate is a package, so it is passed to
# configure_spark_with_delta_pip via `extra_packages` instead.
#
# "spark.jars.packages" is not set here: configure_spark_with_delta_pip sets
# that key itself (the Delta artifact matching the pip-installed delta-spark
# version, plus `extra_packages`), so a value set on the builder beforehand
# would simply be overwritten.
builder = (
    pyspark.sql.SparkSession.builder.appName("LocalDelta")
    .master("local")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=["org.elasticsearch:elasticsearch-spark-30_2.12:8.15.1"],
).getOrCreate()

In [39]:
# Delta table locations used by this notebook. All paths are relative, so they
# resolve against the process working directory.

# Sink table: the *_write functions append org_id values here. The streaming
# checkpoint directories are derived from this string by appending a suffix
# (f"{DRIVER_TABLE_PATH}_..._checkpoint"), so keep it free of a trailing slash.
DRIVER_TABLE_PATH = "local_lake/silver/driver_table"
# Source tables read as streams; ORG_ADDR_TABLE_PATH is additionally read as a
# static table inside addr_write to join on addr_id.
ORG_TABLE_PATH = "local_lake/bronze/organisation/"
ADDR_TABLE_PATH = "local_lake/bronze/address/"
ORG_ADDR_TABLE_PATH = "local_lake/bronze/org_addr/"

def org_write(batch_df, batch_id):
    """Append the distinct ``org_id`` values of one micro-batch to the driver table.

    Meant to be used as a ``foreachBatch`` sink. ``foreachBatch`` gives only
    at-least-once guarantees, so a batch that is retried after a failure would
    append its rows a second time. The write is therefore tagged with Delta's
    idempotent-write options: Delta skips a commit whose
    ``(txnAppId, txnVersion)`` pair has already been committed to the table.

    Caveat: if this stream's checkpoint directory is deleted, batch ids start
    again from 0 and Delta will skip those writes until the ``txnAppId`` below
    is changed (or the driver table is recreated).

    Args:
        batch_df: Micro-batch DataFrame; must contain an ``org_id`` column.
        batch_id: Micro-batch id passed in by Structured Streaming; used as
            the transaction version.
    """
    distinct_org_ids = batch_df.select("org_id").distinct()

    (
        distinct_org_ids.write
        .format("delta")
        .mode("append")
        # The application id must be unique per stream: every stream numbers
        # its batches independently, starting at 0.
        .option("txnAppId", "driver_table_org")
        .option("txnVersion", batch_id)
        .save(DRIVER_TABLE_PATH)
    )

def addr_write(batch_df, batch_id):
    """Append the ``org_id`` values linked to one address micro-batch to the driver table.

    The batch is joined on ``addr_id`` with a static read of the table at
    ``ORG_ADDR_TABLE_PATH`` (re-read on every call), and the distinct
    ``org_id`` values of the join result are appended.

    Meant to be used as a ``foreachBatch`` sink. ``foreachBatch`` gives only
    at-least-once guarantees, so a batch that is retried after a failure would
    append its rows a second time. The write is therefore tagged with Delta's
    idempotent-write options: Delta skips a commit whose
    ``(txnAppId, txnVersion)`` pair has already been committed to the table.

    Caveat: if this stream's checkpoint directory is deleted, batch ids start
    again from 0 and Delta will skip those writes until the ``txnAppId`` below
    is changed (or the driver table is recreated).

    Args:
        batch_df: Micro-batch DataFrame; must contain an ``addr_id`` column.
        batch_id: Micro-batch id passed in by Structured Streaming; used as
            the transaction version.
    """
    org_addr_links = spark.read.format('delta').load(ORG_ADDR_TABLE_PATH)
    affected_org_ids = batch_df.join(org_addr_links, "addr_id").select("org_id").distinct()

    (
        affected_org_ids.write
        .format("delta")
        .mode("append")
        # The application id must be unique per stream: every stream numbers
        # its batches independently, starting at 0.
        .option("txnAppId", "driver_table_addr")
        .option("txnVersion", batch_id)
        .save(DRIVER_TABLE_PATH)
    )

def org_addr_write(batch_df, batch_id):
    """Append the distinct ``org_id`` values of one org/address-link micro-batch to the driver table.

    Meant to be used as a ``foreachBatch`` sink. ``foreachBatch`` gives only
    at-least-once guarantees, so a batch that is retried after a failure would
    append its rows a second time. The write is therefore tagged with Delta's
    idempotent-write options: Delta skips a commit whose
    ``(txnAppId, txnVersion)`` pair has already been committed to the table.

    Caveat: if this stream's checkpoint directory is deleted, batch ids start
    again from 0 and Delta will skip those writes until the ``txnAppId`` below
    is changed (or the driver table is recreated).

    Args:
        batch_df: Micro-batch DataFrame; must contain an ``org_id`` column.
        batch_id: Micro-batch id passed in by Structured Streaming; used as
            the transaction version.
    """
    distinct_org_ids = batch_df.select("org_id").distinct()

    (
        distinct_org_ids.write
        .format("delta")
        .mode("append")
        # The application id must be unique per stream: every stream numbers
        # its batches independently, starting at 0.
        .option("txnAppId", "driver_table_org_addr")
        .option("txnVersion", batch_id)
        .save(DRIVER_TABLE_PATH)
    )


In [40]:
# Open each source Delta table as a streaming DataFrame. Nothing is read yet;
# processing only begins once a writeStream query is started on these frames.
org_df = spark.readStream.format("delta").load(ORG_TABLE_PATH)

addr_df = spark.readStream.format("delta").load(ADDR_TABLE_PATH)

org_addr_df = spark.readStream.format("delta").load(ORG_ADDR_TABLE_PATH)

In [41]:

def _run_once(stream_df, batch_writer, checkpoint_suffix):
    """Run one trigger-once streaming query to completion and return its handle.

    Args:
        stream_df: Streaming DataFrame to consume.
        batch_writer: ``foreachBatch`` callback taking ``(batch_df, batch_id)``.
        checkpoint_suffix: Distinguishes this stream's checkpoint directory,
            which is ``f"{DRIVER_TABLE_PATH}_{checkpoint_suffix}_checkpoint"``.

    Returns:
        The finished ``StreamingQuery``, so its status and progress can still
        be inspected afterwards.
    """
    query = (
        stream_df.writeStream
        .foreachBatch(batch_writer)
        .option("checkpointLocation", f"{DRIVER_TABLE_PATH}_{checkpoint_suffix}_checkpoint")
        .trigger(once=True)
        .start()
    )
    # awaitTermination() without a timeout returns None, so it must not be
    # chained onto the assignment or the query handle is lost.
    query.awaitTermination()
    return query


# The streams run one after another; each blocks until its data is processed.
org_query = _run_once(org_df, org_write, "org")
addr_query = _run_once(addr_df, addr_write, "addr")
org_addr_query = _run_once(org_addr_df, org_addr_write, "org_addr")

In [93]:
# Batch-read the driver table to inspect what the streaming queries appended.
df = (
    spark.read
    .format("delta")
    .load(DRIVER_TABLE_PATH)
)
df.show()

+------+
|org_id|
+------+
|    16|
|    13|
|     5|
|     2|
|     9|
|    14|
|    15|
|    14|
|    12|
|     8|
|     3|
|     9|
|     4|
|     4|
|     7|
|     6|
|     2|
|    11|
|    15|
|    12|
+------+
only showing top 20 rows

