In [1]:
# Bootstrap a local Spark session configured to read Delta tables.
import os

# Set before the session is created below.
os.environ["PYSPARK_ALLOW_INSECURE_GATEWAY"] = "1"

from pyspark.sql import SparkSession


# Delta-related settings, applied to the builder in this order:
# the jars to load, the SQL extension and the catalog implementation.
_delta_settings = {
    "spark.jars": "/usr/local/spark/jars/delta-core_2.12-2.4.0.jar,/usr/local/spark/jars/delta-storage-2.4.0.jar",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

_builder = SparkSession.builder.appName("DeltaReader").master("local[*]")
for _option, _setting in _delta_settings.items():
    _builder = _builder.config(_option, _setting)

# Reuses an already-running session if there is one, otherwise starts a new one.
spark = _builder.getOrCreate()


In [None]:
# Location of the Delta table
SOURCE_PATH = "/home/jovyan/sink_06/delta"

# Load the current state of the table into a DataFrame
df = spark.read.load(SOURCE_PATH, format="delta")

# Inspect it: schema first, then rows with untruncated column values,
# and finally the total number of rows
df.printSchema()
df.show(truncate=False)
row_count = df.count()
print(row_count)

# Wersjonowanie danych

In [None]:
# Time travel: read the table as it was at version 0.
# versionAsOf is 0 so that it matches the df_v0 name.
df_v0 = spark.read.format("delta").option("versionAsOf", 0).load(SOURCE_PATH)
df_v0.show()

In [None]:
from delta.tables import DeltaTable

# Handle to the Delta table at SOURCE_PATH; later cells reuse it.
delta_table = DeltaTable.forPath(spark, SOURCE_PATH)

# Show the table's history with untruncated column values.
history_df = delta_table.history()
history_df.show(truncate=False)

# Merge

In [10]:
# Rows to upsert into the Delta table, keyed by id.
update_rows = [
    (5, "DeltaUser", "YOYO"),              # update
    (1000, "DeltaUser1000", "YOYO 1000"),  # insert a new row
]
updates = spark.createDataFrame(update_rows, ["id", "name", "message"])

# Matched ids get name and message overwritten from the source;
# unmatched source rows are inserted with all their columns.
merge_condition = "target.id = source.id"
(
    delta_table.alias("target")
    .merge(updates.alias("source"), merge_condition)
    .whenMatchedUpdate(
        set={
            "target.name": "source.name",
            "target.message": "source.message",
        }
    )
    .whenNotMatchedInsertAll()
    .execute()
)

In [None]:
# Re-read the table so the DataFrame reflects its current state
df = spark.read.load(SOURCE_PATH, format="delta")

# Data after the merge: only the two ids touched by the upsert
merged_rows = df.filter("id in (5, 1000)")
merged_rows.show(truncate=False)

# Czyszczenie wersji

In [None]:
# Note: the default minimum retention is 168 hours (7 days).
# Going down to 0 requires a Spark configuration change, so the
# retention-duration check is switched off before vacuuming.
spark.conf.set(
    "spark.databricks.delta.retentionDurationCheck.enabled",
    "false",
)
delta_table.vacuum(0)