Delta Live Tables


In [31]:
 #STEP 0 — Setup
# These variables store the paths for each stage of the pipeline

# Landing folders: raw files exactly as they arrive
LANDING_ORDERS    = "dbfs:/FileStore/tables/dlt/landing/orders"
LANDING_CUSTOMERS = "dbfs:/FileStore/tables/dlt/landing/customers"

# Silver folder: cleaned data in Delta format
DELTA_SILVER_PATH = "dbfs:/tmp/delta/sil_orders"

# SQL table name pointing to the silver Delta folder
DELTA_TABLE_NAME  = "sil_orders_tbl"

print("We will store data in:")
print(f"Landing Orders folder:    {LANDING_ORDERS}")
print(f"Landing Customers folder: {LANDING_CUSTOMERS}")
print(f"Silver Delta folder:      {DELTA_SILVER_PATH}")
print(f"SQL Table name:           {DELTA_TABLE_NAME}")

We will store data in:
Landing Orders folder:    dbfs:/FileStore/tables/dlt/landing/orders
Landing Customers folder: dbfs:/FileStore/tables/dlt/landing/customers
Silver Delta folder:      dbfs:/tmp/delta/sil_orders
SQL Table name:           sil_orders_tbl


In [1]:
# Install Java
!apt-get install openjdk-11-jdk-headless -qq > /dev/null

!wget -q https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz

# Install findspark to locate Spark easily
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"

import findspark
findspark.init()

from pyspark.sql import SparkSession

# Download Delta Lake JAR
!wget -q https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.4.0/delta-core_2.12-2.4.0.jar

#spark = SparkSession.builder \
#    .appName("DeltaExample") \
#    .config("spark.jars", "/content/delta-core_2.12-2.4.0.jar") \
#    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
#    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
#    .getOrCreate()

In [3]:
# Install PySpark
!pip install --upgrade pyspark==3.5.1 delta-spark==3.1.0 findspark > /dev/null

import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create Spark Session (local master so it runs without a cluster)
spark = SparkSession.builder \
    .appName("StructuredStreamingRate") \
    .master("local[*]") \
    .config("spark.sql.shuffle.partitions", "1") \
    .getOrCreate()

print("Spark with Delta ready!")


Spark with Delta ready!


In [4]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

LANDING_ORDERS = "/content/landing/orders"
LANDING_CUSTOMERS = "/content/landing/customers"

print("STEP 1: Seeding inline data to landing (JSON) ...")

orders_rows = [
    (1, "C001", "2025-08-08 09:00:00", 12000, "placed"),
    (2, "C002", "2025-08-08 09:05:00",  4500, "placed"),
    (3, "C001", "2025-08-08 09:10:00", 22000, "cancelled"),
    (4, "C003", "2025-08-08 09:15:00",   800, "placed")
]
customers_rows = [
    ("C001", "Ananya", "Bengaluru"),
    ("C002", "Rahul",  "Hyderabad"),
    ("C003", "Meera",  "Pune")
]

orders_schema = T.StructType([
    T.StructField("order_id",    T.IntegerType()),
    T.StructField("customer_id", T.StringType()),
    T.StructField("order_ts",    T.StringType()),
    T.StructField("amount",      T.IntegerType()),
    T.StructField("status",      T.StringType())
])
cust_schema = T.StructType([
    T.StructField("customer_id", T.StringType()),
    T.StructField("name",        T.StringType()),
    T.StructField("city",        T.StringType())
])

orders_df = (spark.createDataFrame(orders_rows, orders_schema)
             .withColumn("order_ts", F.to_timestamp("order_ts")))
customers_df = spark.createDataFrame(customers_rows, cust_schema)

orders_df.write.mode("overwrite").json(LANDING_ORDERS)
customers_df.write.mode("overwrite").json(LANDING_CUSTOMERS)

print("✅ Seeded landing JSON:")
print(f"  {LANDING_ORDERS}")
print(f"  {LANDING_CUSTOMERS}")


STEP 1: Seeding inline data to landing (JSON) ...
✅ Seeded landing JSON:
  /content/landing/orders
  /content/landing/customers


In [5]:
print("STEP 2: BRONZE - Reading raw landing data (no transformations)")

bron_orders = spark.read.json(LANDING_ORDERS)
bron_customers = spark.read.json(LANDING_CUSTOMERS)

print("Bronze Orders - schema & sample")
bron_orders.printSchema()
bron_orders.show(truncate=False)

print("Bronze Customers - schema & sample")
bron_customers.printSchema()
bron_customers.show(truncate=False)

STEP 2: BRONZE - Reading raw landing data (no transformations)
Bronze Orders - schema & sample
root
 |-- amount: long (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_ts: string (nullable = true)
 |-- status: string (nullable = true)

+------+-----------+--------+------------------------+---------+
|amount|customer_id|order_id|order_ts                |status   |
+------+-----------+--------+------------------------+---------+
|22000 |C001       |3       |2025-08-08T09:10:00.000Z|cancelled|
|800   |C003       |4       |2025-08-08T09:15:00.000Z|placed   |
|12000 |C001       |1       |2025-08-08T09:00:00.000Z|placed   |
|4500  |C002       |2       |2025-08-08T09:05:00.000Z|placed   |
+------+-----------+--------+------------------------+---------+

Bronze Customers - schema & sample
root
 |-- city: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)

+---------+-----------+----