In [4]:
import pyspark
from delta import *

builder = pyspark.sql.SparkSession.builder.appName("MyApp") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [22]:
data = spark.range(0, 5)

data.write.format("delta").save("data/delta-table")

AnalysisException: [DELTA_PATH_EXISTS] Cannot write to already existent path file:/Users/alfio/projects/upc/BDMP1/data/delta-table without setting OVERWRITE = 'true'.

In [24]:
df = spark.read.format("delta").load("data/delta-table")
df.show()

+---+
| id|
+---+
| 32|
| 33|
| 34|
| 35|
| 36|
| 37|
| 38|
| 39|
| 47|
| 48|
| 49|
| 50|
| 51|
| 52|
| 53|
| 54|
| 10|
| 11|
| 12|
| 13|
+---+
only showing top 20 rows



In [55]:
data = spark.range(10, 20)
data.write.format("delta").mode("overwrite").save("data/delta-table")

In [56]:
from delta.tables import *
from pyspark.sql.functions import *

deltaTable = DeltaTable.forPath(spark, "data/delta-table")

# Update every even value by adding 100 to it
deltaTable.update(
  condition = expr("id % 2 == 0"),
  set = { "id": expr("id + 100") })

# Delete every even value
deltaTable.delete(condition = expr("id % 2 == 0"))

# Upsert (merge) new data
newData = spark.range(0, 20)

deltaTable.alias("oldData") \
  .merge(
    newData.alias("newData"),
    "oldData.id = newData.id") \
  .whenMatchedUpdate(set = { "id": col("newData.id") }) \
  .whenNotMatchedInsert(values = { "id": col("newData.id") }) \
  .execute()

deltaTable.toDF().show()


+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



In [61]:
df = spark.read.format("delta") \
  .option("versionAsOf", 3)\
  .load("data/delta-table")

df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



In [40]:
streamingDf = spark.readStream.format("rate").load()

stream = streamingDf \
  .selectExpr("value as id") \
  .writeStream.format("delta") \
  .option("checkpointLocation", "data/checkpoint") \
  .start("/data/delta-table")


25/03/23 17:42:55 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [45]:
stream.stop()

In [47]:
stream.

AttributeError: 'StreamingQuery' object has no attribute 'show'

In [51]:
data = DeltaTable.forPath(spark, "data/delta-table").toDF()

data.()

60

In [54]:
df = spark.read.format("delta").load("data/delta_tables/api_data")
df.show()

+----------+---+---------+-----+
|      date| id|     name|value|
+----------+---+---------+-----+
|2025-03-23|  1|Product A|  100|
|2025-03-23|  2|Product B|  200|
|2025-03-23|  3|Product C|  300|
+----------+---+---------+-----+

