In [3]:
# Configuring

!pip uninstall -y pyspark delta-spark
!pip install -q pyspark==3.5.1 delta-spark==3.1.0


Found existing installation: pyspark 4.0.0
Uninstalling pyspark-4.0.0:
  Successfully uninstalled pyspark-4.0.0
Found existing installation: delta-spark 4.0.0
Uninstalling delta-spark-4.0.0:
  Successfully uninstalled delta-spark-4.0.0
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.0/317.0 MB 4.8 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 200.5/200.5 kB 16.7 MB/s eta 0:00:00
  Building wheel for pyspark (setup.py) ... done


In [4]:
# Create a Spark session with Delta Lake support
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

# Describe the session: application name plus the two settings that register
# Delta Lake's SQL extension and catalog implementation.
builder = (
    SparkSession.builder
    .appName("DeltaLakeColab")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Pass the builder through the delta-spark pip helper, then create the session
# (or reuse one that is already running).
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [5]:
# Evaluate the session object as the cell's last expression so the notebook
# renders its representation.
spark

In [6]:
# Sample product catalogue as CSV text. The backslash right after the opening
# quotes suppresses the leading newline, so the header is the first line of the
# file and no blank lines surround the records.
csv_data = """\
id,name,category,price
1,Amit,Electronics,50000
2,Priya,Furniture,3000
3,Rahul,Stationery,200
4,Neha,Books,800
5,Karthik,Electronics,45000
"""

# In an Azure Databricks notebook the same payload would be saved to DBFS:
#   dbutils.fs.put("dbfs:/tmp/products.csv", csv_data, overwrite=True)

# Write the CSV into the working directory. The explicit encoding keeps the
# file contents independent of the platform's default text encoding.
with open("products.csv", "w", encoding="utf-8") as f:
    f.write(csv_data)

In [7]:
# Load the local CSV into a Spark DataFrame: the first row supplies the column
# names and the column types are inferred from the data.
# (On Databricks, point the same reader at "dbfs:/tmp/products.csv".)
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("products.csv")
)
df.show()

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  2|  Priya|  Furniture| 3000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
+---+-------+-----------+-----+



In [8]:
# Save the DataFrame in Delta format under "products_delta", replacing any
# data already stored at that path.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("products_delta")
)

# Load the table back from the same path and display it.
df_delta = spark.read.format("delta").load("products_delta")
df_delta.show()

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  2|  Priya|  Furniture| 3000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
+---+-------+-----------+-----+



In [10]:
from delta.tables import DeltaTable

# Bind a DeltaTable handle to the table stored at "products_delta".
delta_table = DeltaTable.forPath(spark, "products_delta")

# Change the price to 3500 for the product whose id is 2. The condition and
# the new value are both given as strings.
delta_table.update(condition="id = 2", set={"price": "3500"})

# Display the table contents after the update.
delta_table.toDF().show()

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  2|  Priya|  Furniture| 3500|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
+---+-------+-----------+-----+



In [12]:
# Upsert = update the rows that already exist + insert the ones that do not.
new_data = [
    (2, "Priya", "Furniture", 4000),  # id 2 is already in the table -> update
    (6, "Sneha", "Kitchen", 1200),    # id 6 is not in the table -> insert
]

# Build the source DataFrame with the target table's own schema instead of bare
# column names. With names only, Spark infers Python ints as bigint, and the
# merge then depends on an implicit cast into the table's column types.
# NOTE(review): assumes the tuples above follow the table's column order
# (id, name, category, price) — confirm if the table layout changes.
updates_df = spark.createDataFrame(new_data, schema=delta_table.toDF().schema)

# Match source rows to target rows on id: matched rows have all columns
# overwritten, unmatched source rows are inserted.
(
    delta_table.alias("target")
    .merge(updates_df.alias("source"), "target.id = source.id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

# Display the table contents after the merge.
delta_table.toDF().show()

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  2|  Priya|  Furniture| 4000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
|  6|  Sneha|    Kitchen| 1200|
+---+-------+-----------+-----+



In [14]:
# Time travel: compare the table's latest state with an earlier version.

# Latest state of the table.
delta_table.toDF().show()

# State of the table at version 0, selected with the "versionAsOf" read option.
previous_df = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load("products_delta")
)
previous_df.show()

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  2|  Priya|  Furniture| 4000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
|  6|  Sneha|    Kitchen| 1200|
+---+-------+-----------+-----+

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  2|  Priya|  Furniture| 3000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
+---+-------+-----------+-----+



In [15]:
# Partitioning data: write the DataFrame in Delta format, partitioned by the
# "category" column, replacing anything already stored at the target path.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("category")
    .save("/tmp/delta/products_partioned")
)

In [18]:
# Read the partitioned Delta table back from its path and display the rows.
(
    spark.read
    .format("delta")
    .load("/tmp/delta/products_partioned")
    .show()
)

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  5|Karthik|Electronics|45000|
|  3|  Rahul| Stationery|  200|
|  2|  Priya|  Furniture| 3000|
|  4|   Neha|      Books|  800|
+---+-------+-----------+-----+

